Message ID | 20180119045438.28582-9-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | target/arm: Prepatory work for SVE | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > Change vfp.regs as a uint64_t to vfp.zregs as an ARMVectorReg. > The previous patches have made the change in representation > relatively painless. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/arm/cpu.h | 57 ++++++++++++++++++++++++++++++---------------- > target/arm/machine.c | 35 +++++++++++++++++++++++++++- > target/arm/translate-a64.c | 8 +++---- > target/arm/translate.c | 7 +++--- > 4 files changed, 79 insertions(+), 28 deletions(-) > > diff --git a/target/arm/cpu.h b/target/arm/cpu.h > index 7d396606f3..57d805b5d8 100644 > --- a/target/arm/cpu.h > +++ b/target/arm/cpu.h > @@ -153,6 +153,40 @@ typedef struct { > uint32_t base_mask; > } TCR; > > +/* Define a maximum sized vector register. > + * For 32-bit, this is a 128-bit NEON/AdvSIMD register. > + * For 64-bit, this is a 2048-bit SVE register. > + * > + * Note that the mapping between S, D, and Q views of the register bank > + * differs between AArch64 and AArch32. > + * In AArch32: > + * Qn = regs[n].d[1]:regs[n].d[0] > + * Dn = regs[n / 2].d[n & 1] > + * Sn = regs[n / 4].d[n % 4 / 2], > + * bits 31..0 for even n, and bits 63..32 for odd n > + * (and regs[16] to regs[31] are inaccessible) > + * In AArch64: > + * Zn = regs[n].d[*] > + * Qn = regs[n].d[1]:regs[n].d[0] > + * Dn = regs[n].d[0] > + * Sn = regs[n].d[0] bits 31..0 > + * > + * This corresponds to the architecturally defined mapping between > + * the two execution states, and means we do not need to explicitly > + * map these registers when changing states. It might be worth prompting to use the helper functions to get the right offsets here. > + */ > + > +#ifdef TARGET_AARCH64 > +# define ARM_MAX_VQ 16 > +#else > +# define ARM_MAX_VQ 1 > +#endif > + > +typedef struct ARMVectorReg { > + uint64_t d[2 * ARM_MAX_VQ] QEMU_ALIGNED(16); > +} ARMVectorReg; > + The QEMU_ALIGNED also deserves a comment either here or in the comment block above. > + > typedef struct CPUARMState { > /* Regs for current mode. */ > uint32_t regs[16]; > @@ -477,22 +511,7 @@ typedef struct CPUARMState { > > /* VFP coprocessor state. */ > struct { > - /* VFP/Neon register state. Note that the mapping between S, D and Q > - * views of the register bank differs between AArch64 and AArch32: > - * In AArch32: > - * Qn = regs[2n+1]:regs[2n] > - * Dn = regs[n] > - * Sn = regs[n/2] bits 31..0 for even n, and bits 63..32 for odd n > - * (and regs[32] to regs[63] are inaccessible) > - * In AArch64: > - * Qn = regs[2n+1]:regs[2n] > - * Dn = regs[2n] > - * Sn = regs[2n] bits 31..0 > - * This corresponds to the architecturally defined mapping between > - * the two execution states, and means we do not need to explicitly > - * map these registers when changing states. > - */ > - uint64_t regs[64]; > + ARMVectorReg zregs[32]; > > uint32_t xregs[16]; > /* We store these fpcsr fields separately for convenience. */ > @@ -2891,7 +2910,7 @@ static inline void *arm_get_el_change_hook_opaque(ARMCPU *cpu) > */ > static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno) > { > - return &env->vfp.regs[regno]; > + return &env->vfp.zregs[regno >> 1].d[regno & 1]; > } > > /** > @@ -2900,7 +2919,7 @@ static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno) > */ > static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno) > { > - return &env->vfp.regs[2 * regno]; > + return &env->vfp.zregs[regno].d[0]; > } > > /** > @@ -2909,7 +2928,7 @@ static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno) > */ > static inline uint64_t *aa64_vfp_qreg(CPUARMState *env, unsigned regno) > { > - return &env->vfp.regs[2 * regno]; > + return &env->vfp.zregs[regno].d[0]; > } > > #endif > diff --git a/target/arm/machine.c b/target/arm/machine.c > index a85c2430d3..cb0e1c92bb 100644 > --- a/target/arm/machine.c > +++ b/target/arm/machine.c > @@ -50,7 +50,40 @@ static const VMStateDescription vmstate_vfp = { > .minimum_version_id = 3, > .needed = vfp_needed, > .fields = (VMStateField[]) { > - VMSTATE_UINT64_ARRAY(env.vfp.regs, ARMCPU, 64), > + /* For compatibility, store Qn out of Zn here. */ > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[0].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[1].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[2].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[3].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[4].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[5].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[6].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[7].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[8].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[9].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[10].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[11].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[12].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[13].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[14].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[15].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[16].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[17].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[18].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[19].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[20].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[21].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[22].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[23].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[24].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[25].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[26].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[27].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[28].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[29].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[30].d, ARMCPU, 0, 2), > + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[31].d, ARMCPU, 0, 2), > + > /* The xregs array is a little awkward because element 1 (FPSCR) > * requires a specific accessor, so we have to split it up in > * the vmstate: > diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c > index eed64c73e5..10eef870fe 100644 > --- a/target/arm/translate-a64.c > +++ b/target/arm/translate-a64.c > @@ -517,8 +517,8 @@ static inline int vec_reg_offset(DisasContext *s, int regno, > { > int offs = 0; > #ifdef HOST_WORDS_BIGENDIAN > - /* This is complicated slightly because vfp.regs[2n] is > - * still the low half and vfp.regs[2n+1] the high half > + /* This is complicated slightly because vfp.zregs[n].d[0] is > + * still the low half and vfp.zregs[n].d[1] the high half > * of the 128 bit vector, even on big endian systems. > * Calculate the offset assuming a fully bigendian 128 bits, > * then XOR to account for the order of the two 64 bit halves. > @@ -528,7 +528,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno, > #else > offs += element * (1 << size); > #endif > - offs += offsetof(CPUARMState, vfp.regs[regno * 2]); > + offs += offsetof(CPUARMState, vfp.zregs[regno]); > assert_fp_access_checked(s); > return offs; > } > @@ -537,7 +537,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno, > static inline int vec_full_reg_offset(DisasContext *s, int regno) > { > assert_fp_access_checked(s); > - return offsetof(CPUARMState, vfp.regs[regno * 2]); > + return offsetof(CPUARMState, vfp.zregs[regno]); > } > > /* Return a newly allocated pointer to the vector register. */ > diff --git a/target/arm/translate.c b/target/arm/translate.c > index 55826b7e5a..a8c13d3758 100644 > --- a/target/arm/translate.c > +++ b/target/arm/translate.c > @@ -1512,13 +1512,12 @@ static inline void gen_vfp_st(DisasContext *s, int dp, TCGv_i32 addr) > } > } > > -static inline long > -vfp_reg_offset (int dp, int reg) > +static inline long vfp_reg_offset(bool dp, unsigned reg) > { > if (dp) { > - return offsetof(CPUARMState, vfp.regs[reg]); > + return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]); > } else { > - long ofs = offsetof(CPUARMState, vfp.regs[reg >> 1]); > + long ofs = offsetof(CPUARMState, vfp.zregs[reg >> 2].d[(reg >> 1) & 1]); > if (reg & 1) { > ofs += offsetof(CPU_DoubleU, l.upper); > } else { Other than the minor comment notes it looks good: Reviewed-by: Alex Bennée <alex.bennee@linaro.org> -- Alex Bennée
diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 7d396606f3..57d805b5d8 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -153,6 +153,40 @@ typedef struct { uint32_t base_mask; } TCR; +/* Define a maximum sized vector register. + * For 32-bit, this is a 128-bit NEON/AdvSIMD register. + * For 64-bit, this is a 2048-bit SVE register. + * + * Note that the mapping between S, D, and Q views of the register bank + * differs between AArch64 and AArch32. + * In AArch32: + * Qn = regs[n].d[1]:regs[n].d[0] + * Dn = regs[n / 2].d[n & 1] + * Sn = regs[n / 4].d[n % 4 / 2], + * bits 31..0 for even n, and bits 63..32 for odd n + * (and regs[16] to regs[31] are inaccessible) + * In AArch64: + * Zn = regs[n].d[*] + * Qn = regs[n].d[1]:regs[n].d[0] + * Dn = regs[n].d[0] + * Sn = regs[n].d[0] bits 31..0 + * + * This corresponds to the architecturally defined mapping between + * the two execution states, and means we do not need to explicitly + * map these registers when changing states. + */ + +#ifdef TARGET_AARCH64 +# define ARM_MAX_VQ 16 +#else +# define ARM_MAX_VQ 1 +#endif + +typedef struct ARMVectorReg { + uint64_t d[2 * ARM_MAX_VQ] QEMU_ALIGNED(16); +} ARMVectorReg; + + typedef struct CPUARMState { /* Regs for current mode. */ uint32_t regs[16]; @@ -477,22 +511,7 @@ typedef struct CPUARMState { /* VFP coprocessor state. */ struct { - /* VFP/Neon register state. Note that the mapping between S, D and Q - * views of the register bank differs between AArch64 and AArch32: - * In AArch32: - * Qn = regs[2n+1]:regs[2n] - * Dn = regs[n] - * Sn = regs[n/2] bits 31..0 for even n, and bits 63..32 for odd n - * (and regs[32] to regs[63] are inaccessible) - * In AArch64: - * Qn = regs[2n+1]:regs[2n] - * Dn = regs[2n] - * Sn = regs[2n] bits 31..0 - * This corresponds to the architecturally defined mapping between - * the two execution states, and means we do not need to explicitly - * map these registers when changing states. - */ - uint64_t regs[64]; + ARMVectorReg zregs[32]; uint32_t xregs[16]; /* We store these fpcsr fields separately for convenience. */ @@ -2891,7 +2910,7 @@ static inline void *arm_get_el_change_hook_opaque(ARMCPU *cpu) */ static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno) { - return &env->vfp.regs[regno]; + return &env->vfp.zregs[regno >> 1].d[regno & 1]; } /** @@ -2900,7 +2919,7 @@ static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno) */ static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno) { - return &env->vfp.regs[2 * regno]; + return &env->vfp.zregs[regno].d[0]; } /** @@ -2909,7 +2928,7 @@ static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno) */ static inline uint64_t *aa64_vfp_qreg(CPUARMState *env, unsigned regno) { - return &env->vfp.regs[2 * regno]; + return &env->vfp.zregs[regno].d[0]; } #endif diff --git a/target/arm/machine.c b/target/arm/machine.c index a85c2430d3..cb0e1c92bb 100644 --- a/target/arm/machine.c +++ b/target/arm/machine.c @@ -50,7 +50,40 @@ static const VMStateDescription vmstate_vfp = { .minimum_version_id = 3, .needed = vfp_needed, .fields = (VMStateField[]) { - VMSTATE_UINT64_ARRAY(env.vfp.regs, ARMCPU, 64), + /* For compatibility, store Qn out of Zn here. */ + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[0].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[1].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[2].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[3].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[4].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[5].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[6].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[7].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[8].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[9].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[10].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[11].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[12].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[13].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[14].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[15].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[16].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[17].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[18].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[19].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[20].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[21].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[22].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[23].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[24].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[25].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[26].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[27].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[28].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[29].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[30].d, ARMCPU, 0, 2), + VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[31].d, ARMCPU, 0, 2), + /* The xregs array is a little awkward because element 1 (FPSCR) * requires a specific accessor, so we have to split it up in * the vmstate: diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index eed64c73e5..10eef870fe 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -517,8 +517,8 @@ static inline int vec_reg_offset(DisasContext *s, int regno, { int offs = 0; #ifdef HOST_WORDS_BIGENDIAN - /* This is complicated slightly because vfp.regs[2n] is - * still the low half and vfp.regs[2n+1] the high half + /* This is complicated slightly because vfp.zregs[n].d[0] is + * still the low half and vfp.zregs[n].d[1] the high half * of the 128 bit vector, even on big endian systems. * Calculate the offset assuming a fully bigendian 128 bits, * then XOR to account for the order of the two 64 bit halves. @@ -528,7 +528,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno, #else offs += element * (1 << size); #endif - offs += offsetof(CPUARMState, vfp.regs[regno * 2]); + offs += offsetof(CPUARMState, vfp.zregs[regno]); assert_fp_access_checked(s); return offs; } @@ -537,7 +537,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno, static inline int vec_full_reg_offset(DisasContext *s, int regno) { assert_fp_access_checked(s); - return offsetof(CPUARMState, vfp.regs[regno * 2]); + return offsetof(CPUARMState, vfp.zregs[regno]); } /* Return a newly allocated pointer to the vector register. */ diff --git a/target/arm/translate.c b/target/arm/translate.c index 55826b7e5a..a8c13d3758 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -1512,13 +1512,12 @@ static inline void gen_vfp_st(DisasContext *s, int dp, TCGv_i32 addr) } } -static inline long -vfp_reg_offset (int dp, int reg) +static inline long vfp_reg_offset(bool dp, unsigned reg) { if (dp) { - return offsetof(CPUARMState, vfp.regs[reg]); + return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]); } else { - long ofs = offsetof(CPUARMState, vfp.regs[reg >> 1]); + long ofs = offsetof(CPUARMState, vfp.zregs[reg >> 2].d[(reg >> 1) & 1]); if (reg & 1) { ofs += offsetof(CPU_DoubleU, l.upper); } else {
Change vfp.regs as a uint64_t to vfp.zregs as an ARMVectorReg. The previous patches have made the change in representation relatively painless. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/cpu.h | 57 ++++++++++++++++++++++++++++++---------------- target/arm/machine.c | 35 +++++++++++++++++++++++++++- target/arm/translate-a64.c | 8 +++---- target/arm/translate.c | 7 +++--- 4 files changed, 79 insertions(+), 28 deletions(-) -- 2.14.3