cpu, arm: Separate Float* from SimdFloat*, add FloatMem* opClass | Review Request

Information
Submitter:	Fernando Endo
Repository:	gem5
Branch:
Bugs:
Depends On:
Reviewers
Groups:	Default
People:

Description

Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to
Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which
distinguish memory accesses that use the INT register bank from those that
use the FP register bank.
Change the latency of (Simd)FloatMultAcc to 5, based on the Cortex-A72,
where the "latency" of FMADD is 3 if the next instruction is a FMADD and
has only the augend to destination dependency, otherwise it's 7 cycles.

Testing Done

The changes from SimdFloat to Float were only quickly tested. However, the neon64.isa and fp64.isa files already separate the two implementations, so the change should not introduce bugs.

The following load and store instructions were tested; each is followed by the number of micro-ops (uops) and the opClass reported in stats.txt:
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead

asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead

asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead

asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead

asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead

asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead

asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead

asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead

asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead

asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead

asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead

asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite

asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite

asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite

asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite

asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite

asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite

asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite

asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite

asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite

Issue Summary

Description	From	Last Updated	Status
Does this change the performance at all? Is there a need for this change?	Jason Lowe-Power	July 31, 2016, 4:43 a.m.	Dropped

Thanks for this contribution, the Float/Simd split for AArch64 makes a lot of sense.
Overall the modifications look great, I only have a couple of comments:
1. I'm not sure whether MinorCPU would still work, given the additional opclasses (it might just ignore them). I'd suggest updating src/cpu/minor/MinorCPU.py as well to include the new "Float" opclasses in MinorDefaultFloatSimdFU; that should be enough to get it working, I believe.
2. I'm not too keen on the addition of FloatMem{Read/Write}. These might be useful for generating instruction distributions, but from a functional perspective all ARM loads/stores just deal with bytes and do not need to interpret their content, apart from endianness conversions (not sure about x86). I understand that in this context "Float" means that the destination is a Float register. However, in my view the opclass in gem5 is mostly a way to steer instructions to functional units, and in this case plain Mem{Read/Write} and FloatMem{Read/Write} will always land on the same datapath...

Fernando Endo July 16, 2016, 9:57 a.m. (July 16, 2016, 9:57 a.m.)

Hello Giacomo,

Thanks for your suggestions, I fixed and tested the Minor CPU config.
Regarding the FloatMem* opClass, I followed the gem5 spirit of having a highly configurable user interface. In my patch I purposefully put FloatMemWrite in the same "functional unit" (i.e., execution port) as MemWrite, and likewise FloatMemRead in the same one as MemRead, which is the usual arrangement. However, a user may want, for example, a longer latency for FP loads/stores, or a separate execution port for them.

Giacomo Gabrielli July 20, 2016, 2:47 a.m. (July 20, 2016, 2:47 a.m.)
```
Overall I'm happy with the current status of the patch. Thanks!
```

Change Summary:

Upload new diff

Diff:

Revision 2 (+87 -58)

Show changes

	configs/common/O3_ARM_v7a.py
	src/arch/isa_parser.py
	src/arch/arm/isa/insts/fp64.isa
	src/cpu/FuncUnit.py
	src/cpu/op_class.hh
	src/cpu/minor/MinorCPU.py
	src/cpu/o3/FuncUnitConfig.py

Seems reasonable to me except for the comment below. It would be good if one of the ARM folks had a look at this and signed off.

Also, does this affect the regression stats? I would imagine so.

configs/common/O3_ARM_v7a.py (Diff revision 2)

Does this change the performance at all? Is there a need for this change?

Show all issues

Giacomo Gabrielli July 20, 2016, 2:43 a.m. (July 20, 2016, 2:43 a.m.)

I think there is definitely a need for this change - the single-cycle FMA looked like an oversight...
This will certainly affect the regression stats, but in a good way :)

Fernando Endo July 31, 2016, 4:50 a.m. (July 31, 2016, 4:50 a.m.)

Briefly, in the Cortex-A72, the "latency" of FMADD is 3 if the next instruction is a FMADD and has only the augend to destination dependency, otherwise its latency is 7 cycles. Averaging, we get 5.

I'm also currently working on a fix for this; the mechanism is called late-forwarding.

Jason Lowe-Power Aug. 1, 2016, 7:11 a.m. (Aug. 1, 2016, 7:11 a.m.)

Gotcha. Could you add something to the commit message about this? Also, it will make it easier to commit if you format the commit message according to the formatting guidelines: http://www.m5sim.org/Submitting_Contributions#Commit_Messages.

Thanks!

Ship It!

Ship It!

Change Summary:

Format summary and description

Summary:

-	cpu, arm: Distinguish Float* and SimdFloat, create FloatMem opClass
+	cpu, arm: Separate Float* from SimdFloat, add FloatMem opClass

Description:

~		Modify the opClass assigned to AArch64 FP instructions from SimdFloat to Float. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguishes writes to the INT and FP register banks.
	~	Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to
	+	Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which
	+	distinguishes writes to the INT and FP register banks.
	+	Change the latency of (Simd)FloatMultAcc to 5, based on the Cortex-A72,
	+	where the "latency" of FMADD is 3 if the next instruction is a FMADD and
	+	has only the augend to destination dependency, otherwise it's 7 cycles.

Please mark this as submitted. Thanks.

You have a pending review.

Review Board 2.0.15

This change has been marked as submitted.

cpu, arm: Separate Float* from SimdFloat, add FloatMem opClass

Screenshots

Files

Issue Summary

Change Summary:

Diff:

Change Summary:

Summary:

Description:

Status: Closed (submitted)