@@ -665,6 +665,68 @@ static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v)
write_fp_dreg(s, reg, tmp);
}
+/*
+ * Write double result v to 128 bit vector register reg, honouring FPCR.NEP:
+ * - if FPCR.NEP == 0, clear the high elements of reg
+ * - if FPCR.NEP == 1, set the high elements of reg from mergereg
+ * (i.e. merge the result with the high elements of mergereg)
+ * In either case, SVE register bits above 128 are zeroed (per R_WKYLB).
+ */
+static void write_fp_dreg_merging(DisasContext *s, int reg, int mergereg,
+ TCGv_i64 v)
+{
+ if (!s->fpcr_nep) {
+ write_fp_dreg(s, reg, v);
+ return;
+ }
+
+ /*
+ * Copy 16 bytes from mergereg to reg (this sets the high elements and
+ * clears the bits above 128), then store v over the result element.
+ */
+ tcg_gen_gvec_mov(MO_64, fp_reg_offset(s, reg, MO_64),
+ fp_reg_offset(s, mergereg, MO_64),
+ 16, vec_full_reg_size(s));
+ tcg_gen_st_i64(v, tcg_env, fp_reg_offset(s, reg, MO_64));
+}
+
+/*
+ * Write single-prec result v to reg. If FPCR.NEP is 0 the higher elements
+ * of reg are cleared; otherwise they are taken from mergereg.
+ */
+static void write_fp_sreg_merging(DisasContext *s, int reg, int mergereg,
+ TCGv_i32 v)
+{
+ if (!s->fpcr_nep) {
+ write_fp_sreg(s, reg, v);
+ return;
+ }
+
+ tcg_gen_gvec_mov(MO_64, fp_reg_offset(s, reg, MO_64),
+ fp_reg_offset(s, mergereg, MO_64),
+ 16, vec_full_reg_size(s));
+ tcg_gen_st_i32(v, tcg_env, fp_reg_offset(s, reg, MO_32));
+}
+
+/*
+ * Write half-prec result v to reg. If FPCR.NEP is 0 the higher elements of
+ * reg are cleared (via write_fp_sreg(), so the top 16 bits of v must be zero
+ * on entry); otherwise they are taken from mergereg.
+ */
+static void write_fp_hreg_merging(DisasContext *s, int reg, int mergereg,
+ TCGv_i32 v)
+{
+ if (!s->fpcr_nep) {
+ write_fp_sreg(s, reg, v);
+ return;
+ }
+
+ tcg_gen_gvec_mov(MO_64, fp_reg_offset(s, reg, MO_64),
+ fp_reg_offset(s, mergereg, MO_64),
+ 16, vec_full_reg_size(s));
+ tcg_gen_st16_i32(v, tcg_env, fp_reg_offset(s, reg, MO_16));
+}
+
/* Expand a 2-operand AdvSIMD vector operation using an expander function. */
static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn,
GVecGen2Fn *gvec_fn, int vece)
@@ -5038,7 +5100,7 @@ typedef struct FPScalar {
} FPScalar;
static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
- const FPScalar *f,
+ const FPScalar *f, int mergereg,
ARMFPStatusFlavour fpsttype)
{
switch (a->esz) {
@@ -5047,7 +5109,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
TCGv_i64 t0 = read_fp_dreg(s, a->rn);
TCGv_i64 t1 = read_fp_dreg(s, a->rm);
f->gen_d(t0, t0, t1, fpstatus_ptr(fpsttype));
- write_fp_dreg(s, a->rd, t0);
+ write_fp_dreg_merging(s, a->rd, mergereg, t0);
}
break;
case MO_32:
@@ -5055,7 +5117,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
TCGv_i32 t0 = read_fp_sreg(s, a->rn);
TCGv_i32 t1 = read_fp_sreg(s, a->rm);
f->gen_s(t0, t0, t1, fpstatus_ptr(fpsttype));
- write_fp_sreg(s, a->rd, t0);
+ write_fp_sreg_merging(s, a->rd, mergereg, t0);
}
break;
case MO_16:
@@ -5066,7 +5128,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
TCGv_i32 t0 = read_fp_hreg(s, a->rn);
TCGv_i32 t1 = read_fp_hreg(s, a->rm);
f->gen_h(t0, t0, t1, fpstatus_ptr(fpsttype));
- write_fp_sreg(s, a->rd, t0);
+ write_fp_hreg_merging(s, a->rd, mergereg, t0);
}
break;
default:
@@ -5075,16 +5137,19 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
return true;
}
-static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f)
+static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f,
+ int mergereg)
{
- return do_fp3_scalar_with_fpsttype(s, a, f,
+ return do_fp3_scalar_with_fpsttype(s, a, f, mergereg,
a->esz == MO_16 ?
FPST_FPCR_F16_A64 : FPST_FPCR_A64);
}
-static bool do_fp3_scalar_ah(DisasContext *s, arg_rrr_e *a, const FPScalar *f)
+static bool do_fp3_scalar_ah(DisasContext *s, arg_rrr_e *a, const FPScalar *f,
+ int mergereg)
{
- return do_fp3_scalar_with_fpsttype(s, a, f, select_fpst(s, a->esz));
+ return do_fp3_scalar_with_fpsttype(s, a, f, mergereg,
+ select_fpst(s, a->esz));
}
static const FPScalar f_scalar_fadd = {
@@ -5092,63 +5157,63 @@ static const FPScalar f_scalar_fadd = {
gen_helper_vfp_adds,
gen_helper_vfp_addd,
};
-TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd)
+TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd, a->rn)
static const FPScalar f_scalar_fsub = {
gen_helper_vfp_subh,
gen_helper_vfp_subs,
gen_helper_vfp_subd,
};
-TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub)
+TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub, a->rn)
static const FPScalar f_scalar_fdiv = {
gen_helper_vfp_divh,
gen_helper_vfp_divs,
gen_helper_vfp_divd,
};
-TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv)
+TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv, a->rn)
static const FPScalar f_scalar_fmul = {
gen_helper_vfp_mulh,
gen_helper_vfp_muls,
gen_helper_vfp_muld,
};
-TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul)
+TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul, a->rn)
static const FPScalar f_scalar_fmax = {
gen_helper_vfp_maxh,
gen_helper_vfp_maxs,
gen_helper_vfp_maxd,
};
-TRANS(FMAX_s, do_fp3_scalar, a, &f_scalar_fmax)
+TRANS(FMAX_s, do_fp3_scalar, a, &f_scalar_fmax, a->rn)
static const FPScalar f_scalar_fmin = {
gen_helper_vfp_minh,
gen_helper_vfp_mins,
gen_helper_vfp_mind,
};
-TRANS(FMIN_s, do_fp3_scalar, a, &f_scalar_fmin)
+TRANS(FMIN_s, do_fp3_scalar, a, &f_scalar_fmin, a->rn)
static const FPScalar f_scalar_fmaxnm = {
gen_helper_vfp_maxnumh,
gen_helper_vfp_maxnums,
gen_helper_vfp_maxnumd,
};
-TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm)
+TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm, a->rn)
static const FPScalar f_scalar_fminnm = {
gen_helper_vfp_minnumh,
gen_helper_vfp_minnums,
gen_helper_vfp_minnumd,
};
-TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm)
+TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm, a->rn)
static const FPScalar f_scalar_fmulx = {
gen_helper_advsimd_mulxh,
gen_helper_vfp_mulxs,
gen_helper_vfp_mulxd,
};
-TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx)
+TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx, a->rn)
static void gen_fnmul_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s)
{
@@ -5173,42 +5238,42 @@ static const FPScalar f_scalar_fnmul = {
gen_fnmul_s,
gen_fnmul_d,
};
-TRANS(FNMUL_s, do_fp3_scalar, a, &f_scalar_fnmul)
+TRANS(FNMUL_s, do_fp3_scalar, a, &f_scalar_fnmul, a->rn)
static const FPScalar f_scalar_fcmeq = {
gen_helper_advsimd_ceq_f16,
gen_helper_neon_ceq_f32,
gen_helper_neon_ceq_f64,
};
-TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq)
+TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq, a->rm)
static const FPScalar f_scalar_fcmge = {
gen_helper_advsimd_cge_f16,
gen_helper_neon_cge_f32,
gen_helper_neon_cge_f64,
};
-TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge)
+TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge, a->rm)
static const FPScalar f_scalar_fcmgt = {
gen_helper_advsimd_cgt_f16,
gen_helper_neon_cgt_f32,
gen_helper_neon_cgt_f64,
};
-TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt)
+TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt, a->rm)
static const FPScalar f_scalar_facge = {
gen_helper_advsimd_acge_f16,
gen_helper_neon_acge_f32,
gen_helper_neon_acge_f64,
};
-TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge)
+TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge, a->rm)
static const FPScalar f_scalar_facgt = {
gen_helper_advsimd_acgt_f16,
gen_helper_neon_acgt_f32,
gen_helper_neon_acgt_f64,
};
-TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt)
+TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt, a->rm)
static void gen_fabd_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s)
{
@@ -5233,21 +5298,21 @@ static const FPScalar f_scalar_fabd = {
gen_fabd_s,
gen_fabd_d,
};
-TRANS(FABD_s, do_fp3_scalar, a, &f_scalar_fabd)
+TRANS(FABD_s, do_fp3_scalar, a, &f_scalar_fabd, a->rn)
static const FPScalar f_scalar_frecps = {
gen_helper_recpsf_f16,
gen_helper_recpsf_f32,
gen_helper_recpsf_f64,
};
-TRANS(FRECPS_s, do_fp3_scalar_ah, a, &f_scalar_frecps)
+TRANS(FRECPS_s, do_fp3_scalar_ah, a, &f_scalar_frecps, a->rn)
static const FPScalar f_scalar_frsqrts = {
gen_helper_rsqrtsf_f16,
gen_helper_rsqrtsf_f32,
gen_helper_rsqrtsf_f64,
};
-TRANS(FRSQRTS_s, do_fp3_scalar_ah, a, &f_scalar_frsqrts)
+TRANS(FRSQRTS_s, do_fp3_scalar_ah, a, &f_scalar_frsqrts, a->rn)
static bool do_fcmp0_s(DisasContext *s, arg_rr_e *a,
const FPScalar *f, bool swap)
For FEAT_AFP's FPCR.NEP bit, we need to programmatically change the behaviour of the writeback of the result for most SIMD scalar operations, so that instead of zeroing the upper part of the result register it merges the upper elements from one of the input registers. Provide new functions write_fp_*reg_merging() which can be used instead of the existing write_fp_*reg() functions when we want this "merge the result with one of the input registers if FPCR.NEP is enabled" handling, and use them in do_fp3_scalar_with_fpsttype(). Note that (as documented in the description of the FPCR.NEP bit) which input register to use as the merge source varies by instruction: for these 2-input scalar operations, the comparison instructions take from Rm, not Rn. We'll extend this to also provide the merging behaviour for the remaining scalar insns in subsequent commits. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> --- target/arm/tcg/translate-a64.c | 117 +++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 26 deletions(-)