cpu, arm: Separate Float* from SimdFloat*, add FloatMem* opClass
Review Request #3547 - Created July 10, 2016 and submitted - Latest diff uploaded
| Information | |
|---|---|
| Fernando Endo | |
| gem5 | |
| Reviewers | |
| Default | |
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to
Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which
distinguish writes to the INT and FP register banks.
Change the latency of (Simd)FloatMultAcc to 5, based on the Cortex-A72,
where the "latency" of FMADD is 3 cycles if the next instruction is also an
FMADD whose only dependency on it is augend-to-destination, and 7 cycles otherwise.
The changes from SimdFloat to Float were only quickly tested. However, the neon64.isa and fp64.isa files already separate the two implementations, so the change should not introduce bugs.
Load and store instructions tested; each is followed by its number of micro-ops (#uops) and its opClass, as reported in stats.txt:

    asm("ldr X2, [X30], #0"); // 1 MemRead
    asm("ldp X2, X3, [X30], #0"); // 1 MemRead
    asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
    asm("ldr W2, [X30], #0"); // 1 MemRead
    asm("ldp W2, W3, [X30], #0"); // 1 MemRead
    asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
    asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
    asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
    asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
    asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
    asm("ldr D5, [SP], #0"); // 1 FloatMemRead
    asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
    asm("ldr S6, [SP], #0"); // 1 FloatMemRead
    asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
    asm("ldr H7, [X30], #0"); // 1 FloatMemRead
    asm("ldr B8, [SP], #0"); // 1 FloatMemRead
    asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
    asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
    asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
    asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
    asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
    asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
    asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
    asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
    asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
    asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
    asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
    asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
    asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
    asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
    asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
    asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
    asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
    asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
    asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
    asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
    asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
    asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
    asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
    asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
    asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
    asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
    asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
    asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
    asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
    asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
    asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
    asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
    asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
    asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
    asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
    asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
    asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
    asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
    asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
    asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
    asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
    asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
    asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
    asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
    asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
    asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
    asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
    asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
    asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
    asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
    asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
    asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
    asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead

    asm("str X2, [X30], #0"); // 1 MemWrite
    asm("stp X2, X3, [X30], #0"); // 2 MemWrite
    asm("str W2, [X30], #0"); // 1 MemWrite
    asm("stp W2, W3, [X30], #0"); // 1 MemWrite
    asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
    asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
    asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
    asm("str Q4, [X29], #0"); // 2 FloatMemWrite
    asm("str D5, [X29], #0"); // 1 FloatMemWrite
    asm("str S6, [X29], #0"); // 1 FloatMemWrite
    asm("str H7, [X29], #0"); // 1 FloatMemWrite
    asm("str B8, [X29], #0"); // 1 FloatMemWrite
    asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
    asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
    asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
    asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
    asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
    asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
    asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
    asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
    asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
    asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
    asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
    asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
    asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
    asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
    asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
    asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
    asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
    asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
    asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
    asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
    asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
    asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
    asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
    asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
    asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
    asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
    asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
    asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
    asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
    asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
    asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
    asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
    asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
    asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
    asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
    asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
    asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
