@@ -665,6 +665,68 @@ static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v)
write_fp_dreg(s, reg, tmp);
}
+/*
+ * Write double result v to 128 bit vector register reg, honouring FPCR.NEP:
+ * - if FPCR.NEP == 0, clear the high elements of reg
+ * - if FPCR.NEP == 1, set the high elements of reg from mergereg
+ * (i.e. merge the result with those high elements)
+ * In either case, SVE register bits above 128 are zeroed (per R_WKYLB).
+ */
+static void write_fp_dreg_merging(DisasContext *s, int reg, int mergereg,
+ TCGv_i64 v)
+{
+ if (!s->fpcr_nep) {
+ write_fp_dreg(s, reg, v);
+ return;
+ }
+
+ /*
+ * Copy the low 128 bits of mergereg into reg first (as a side effect this
+ * clears reg above bit 128), then store v over the first 64-bit element.
+ */
+ tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg),
+ vec_full_reg_offset(s, mergereg),
+ 16, vec_full_reg_size(s));
+ tcg_gen_st_i64(v, tcg_env, vec_full_reg_offset(s, reg));
+}
+
+/*
+ * Write single-prec result v to reg, honouring FPCR.NEP: if NEP is 0 clear
+ * the higher elements of reg; otherwise copy them from mergereg.
+ */
+static void write_fp_sreg_merging(DisasContext *s, int reg, int mergereg,
+ TCGv_i32 v)
+{
+ if (!s->fpcr_nep) {
+ write_fp_sreg(s, reg, v);
+ return;
+ }
+ /* Copy mergereg into reg, then store v over the MO_32 element slot. */
+ tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg),
+ vec_full_reg_offset(s, mergereg),
+ 16, vec_full_reg_size(s));
+ tcg_gen_st_i32(v, tcg_env, fp_reg_offset(s, reg, MO_32));
+}
+
+/*
+ * Write half-prec result v to reg, honouring FPCR.NEP: if NEP is 0 clear
+ * the higher elements of reg; otherwise copy them from mergereg.
+ * v's top 16 bits must be zero: the NEP == 0 path uses write_fp_sreg().
+ */
+static void write_fp_hreg_merging(DisasContext *s, int reg, int mergereg,
+ TCGv_i32 v)
+{
+ if (!s->fpcr_nep) {
+ write_fp_sreg(s, reg, v);
+ return;
+ }
+ /* Copy mergereg into reg, then store 16 bits of v at the MO_16 slot. */
+ tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg),
+ vec_full_reg_offset(s, mergereg),
+ 16, vec_full_reg_size(s));
+ tcg_gen_st16_i32(v, tcg_env, fp_reg_offset(s, reg, MO_16));
+}
+
/* Expand a 2-operand AdvSIMD vector operation using an expander function. */
static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn,
GVecGen2Fn *gvec_fn, int vece)
@@ -5038,7 +5100,7 @@ typedef struct FPScalar {
} FPScalar;
static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
- const FPScalar *f,
+ const FPScalar *f, int mergereg,
ARMFPStatusFlavour fpsttype)
{
switch (a->esz) {
@@ -5047,7 +5109,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
TCGv_i64 t0 = read_fp_dreg(s, a->rn);
TCGv_i64 t1 = read_fp_dreg(s, a->rm);
f->gen_d(t0, t0, t1, fpstatus_ptr(fpsttype));
- write_fp_dreg(s, a->rd, t0);
+ write_fp_dreg_merging(s, a->rd, mergereg, t0);
}
break;
case MO_32:
@@ -5055,7 +5117,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
TCGv_i32 t0 = read_fp_sreg(s, a->rn);
TCGv_i32 t1 = read_fp_sreg(s, a->rm);
f->gen_s(t0, t0, t1, fpstatus_ptr(fpsttype));
- write_fp_sreg(s, a->rd, t0);
+ write_fp_sreg_merging(s, a->rd, mergereg, t0);
}
break;
case MO_16:
@@ -5066,7 +5128,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
TCGv_i32 t0 = read_fp_hreg(s, a->rn);
TCGv_i32 t1 = read_fp_hreg(s, a->rm);
f->gen_h(t0, t0, t1, fpstatus_ptr(fpsttype));
- write_fp_sreg(s, a->rd, t0);
+ write_fp_hreg_merging(s, a->rd, mergereg, t0);
}
break;
default:
@@ -5075,16 +5137,19 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a,
return true;
}
-static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f)
+static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f,
+ int mergereg)
{
- return do_fp3_scalar_with_fpsttype(s, a, f,
+ return do_fp3_scalar_with_fpsttype(s, a, f, mergereg,
a->esz == MO_16 ?
FPST_A64_F16 : FPST_A64);
}
-static bool do_fp3_scalar_ah(DisasContext *s, arg_rrr_e *a, const FPScalar *f)
+static bool do_fp3_scalar_ah(DisasContext *s, arg_rrr_e *a, const FPScalar *f,
+ int mergereg)
{
- return do_fp3_scalar_with_fpsttype(s, a, f, select_ah_fpst(s, a->esz));
+ return do_fp3_scalar_with_fpsttype(s, a, f, mergereg,
+ select_ah_fpst(s, a->esz));
}
static const FPScalar f_scalar_fadd = {
@@ -5092,63 +5157,63 @@ static const FPScalar f_scalar_fadd = {
gen_helper_vfp_adds,
gen_helper_vfp_addd,
};
-TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd)
+TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd, a->rn)
static const FPScalar f_scalar_fsub = {
gen_helper_vfp_subh,
gen_helper_vfp_subs,
gen_helper_vfp_subd,
};
-TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub)
+TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub, a->rn)
static const FPScalar f_scalar_fdiv = {
gen_helper_vfp_divh,
gen_helper_vfp_divs,
gen_helper_vfp_divd,
};
-TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv)
+TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv, a->rn)
static const FPScalar f_scalar_fmul = {
gen_helper_vfp_mulh,
gen_helper_vfp_muls,
gen_helper_vfp_muld,
};
-TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul)
+TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul, a->rn)
static const FPScalar f_scalar_fmax = {
gen_helper_vfp_maxh,
gen_helper_vfp_maxs,
gen_helper_vfp_maxd,
};
-TRANS(FMAX_s, do_fp3_scalar, a, &f_scalar_fmax)
+TRANS(FMAX_s, do_fp3_scalar, a, &f_scalar_fmax, a->rn)
static const FPScalar f_scalar_fmin = {
gen_helper_vfp_minh,
gen_helper_vfp_mins,
gen_helper_vfp_mind,
};
-TRANS(FMIN_s, do_fp3_scalar, a, &f_scalar_fmin)
+TRANS(FMIN_s, do_fp3_scalar, a, &f_scalar_fmin, a->rn)
static const FPScalar f_scalar_fmaxnm = {
gen_helper_vfp_maxnumh,
gen_helper_vfp_maxnums,
gen_helper_vfp_maxnumd,
};
-TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm)
+TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm, a->rn)
static const FPScalar f_scalar_fminnm = {
gen_helper_vfp_minnumh,
gen_helper_vfp_minnums,
gen_helper_vfp_minnumd,
};
-TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm)
+TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm, a->rn)
static const FPScalar f_scalar_fmulx = {
gen_helper_advsimd_mulxh,
gen_helper_vfp_mulxs,
gen_helper_vfp_mulxd,
};
-TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx)
+TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx, a->rn)
static void gen_fnmul_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s)
{
@@ -5173,42 +5238,42 @@ static const FPScalar f_scalar_fnmul = {
gen_fnmul_s,
gen_fnmul_d,
};
-TRANS(FNMUL_s, do_fp3_scalar, a, &f_scalar_fnmul)
+TRANS(FNMUL_s, do_fp3_scalar, a, &f_scalar_fnmul, a->rn)
static const FPScalar f_scalar_fcmeq = {
gen_helper_advsimd_ceq_f16,
gen_helper_neon_ceq_f32,
gen_helper_neon_ceq_f64,
};
-TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq)
+TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq, a->rm)
static const FPScalar f_scalar_fcmge = {
gen_helper_advsimd_cge_f16,
gen_helper_neon_cge_f32,
gen_helper_neon_cge_f64,
};
-TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge)
+TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge, a->rm)
static const FPScalar f_scalar_fcmgt = {
gen_helper_advsimd_cgt_f16,
gen_helper_neon_cgt_f32,
gen_helper_neon_cgt_f64,
};
-TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt)
+TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt, a->rm)
static const FPScalar f_scalar_facge = {
gen_helper_advsimd_acge_f16,
gen_helper_neon_acge_f32,
gen_helper_neon_acge_f64,
};
-TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge)
+TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge, a->rm)
static const FPScalar f_scalar_facgt = {
gen_helper_advsimd_acgt_f16,
gen_helper_neon_acgt_f32,
gen_helper_neon_acgt_f64,
};
-TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt)
+TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt, a->rm)
static void gen_fabd_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s)
{
@@ -5233,21 +5298,21 @@ static const FPScalar f_scalar_fabd = {
gen_fabd_s,
gen_fabd_d,
};
-TRANS(FABD_s, do_fp3_scalar, a, &f_scalar_fabd)
+TRANS(FABD_s, do_fp3_scalar, a, &f_scalar_fabd, a->rn)
static const FPScalar f_scalar_frecps = {
gen_helper_recpsf_f16,
gen_helper_recpsf_f32,
gen_helper_recpsf_f64,
};
-TRANS(FRECPS_s, do_fp3_scalar_ah, a, &f_scalar_frecps)
+TRANS(FRECPS_s, do_fp3_scalar_ah, a, &f_scalar_frecps, a->rn)
static const FPScalar f_scalar_frsqrts = {
gen_helper_rsqrtsf_f16,
gen_helper_rsqrtsf_f32,
gen_helper_rsqrtsf_f64,
};
-TRANS(FRSQRTS_s, do_fp3_scalar_ah, a, &f_scalar_frsqrts)
+TRANS(FRSQRTS_s, do_fp3_scalar_ah, a, &f_scalar_frsqrts, a->rn)
static bool do_fcmp0_s(DisasContext *s, arg_rr_e *a,
const FPScalar *f, bool swap)