Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper-sve.h    |  24 ++++
 target/arm/sve.decode      |  12 ++
 target/arm/sve_helper.c    |  56 +++++++++
 target/arm/translate-sve.c | 238 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 330 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2419,3 +2419,27 @@ DEF_HELPER_FLAGS_5(sve2_uabal_d, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(sve2_adcl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_adcl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_sqxtnb_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtnb_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtnb_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_uqxtnb_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_uqxtnb_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_uqxtnb_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_sqxtunb_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtunb_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtunb_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_sqxtnt_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtnt_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtnt_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_uqxtnt_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_uqxtnt_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_uqxtnt_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_sqxtunt_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtunt_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sqxtunt_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
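
Each of these declarations expands, through QEMU's helper-head machinery, to an out-of-line helper prototype of roughly this shape (a sketch, using sve2_sqxtnb_h as the example):

    void helper_sve2_sqxtnb_h(void *vd, void *vn, uint32_t desc);

TCG_CALL_NO_RWG tells the optimizer that the call neither reads nor writes TCG globals, so globals cached in host registers need not be spilled around it.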
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1272,3 +1272,15 @@ SLI 01000101 .. 0 ..... 11110 1 ..... ..... @rd_rn_tszimm_shl
# TODO: Use @rda and %reg_movprfx here.
SABA            01000101 .. 0 ..... 11111 0 ..... .....  @rd_rn_rm
UABA            01000101 .. 0 ..... 11111 1 ..... .....  @rd_rn_rm
+
+#### SVE2 Narrowing
+
+## SVE2 saturating extract narrow
+
+# Bits 23, 18-16 are zero, limited in the translator via esz < 3 & imm == 0.
+SQXTNB          01000101 .. 1 ..... 010 000 ..... .....  @rd_rn_tszimm_shl
+SQXTNT          01000101 .. 1 ..... 010 001 ..... .....  @rd_rn_tszimm_shl
+UQXTNB          01000101 .. 1 ..... 010 010 ..... .....  @rd_rn_tszimm_shl
+UQXTNT          01000101 .. 1 ..... 010 011 ..... .....  @rd_rn_tszimm_shl
+SQXTUNB         01000101 .. 1 ..... 010 100 ..... .....  @rd_rn_tszimm_shl
+SQXTUNT         01000101 .. 1 ..... 010 101 ..... .....  @rd_rn_tszimm_shl
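
Reusing @rd_rn_tszimm_shl means encodings with a nonzero shift or a quadword size still match the pattern; do_sve2_narrow_extract below rejects those via the esz/imm check. As a rough scalar reference for what the six instructions mean (a sketch, not QEMU code; the 32-to-16-bit case on a fixed 128-bit vector is assumed for illustration), the B forms narrow into the even half-width elements and zero the odd ones, while the T forms narrow into the odd elements and preserve the even ones:

    #include <stdint.h>

    static int16_t sat16(int32_t v)
    {
        return v < INT16_MIN ? INT16_MIN : v > INT16_MAX ? INT16_MAX : v;
    }

    /* SQXTNB: even (bottom) lanes get saturated results, odd lanes are zeroed. */
    static void ref_sqxtnb_s(int16_t zd[8], const int32_t zn[4])
    {
        for (int e = 0; e < 4; e++) {
            zd[2 * e] = sat16(zn[e]);
            zd[2 * e + 1] = 0;
        }
    }

    /* SQXTNT: odd (top) lanes get saturated results, even lanes keep old zd. */
    static void ref_sqxtnt_s(int16_t zd[8], const int32_t zn[4])
    {
        for (int e = 0; e < 4; e++) {
            zd[2 * e + 1] = sat16(zn[e]);
        }
    }

The UQXTN* forms clamp an unsigned source to [0, UINT16_MAX] instead, and the SQXTUN* forms clamp a signed source to that same unsigned range.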
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1264,6 +1264,62 @@ DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
#undef DO_ZZZW_ACC
+#define DO_XTNB(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
+ *(TYPE *)(vd + i) = nn; \
+ } \
+}
+
+#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ *(TYPEN *)(vd + i + odd) = OP(nn); \
+ } \
+}
+
+#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
+#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
+#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
+
+DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
+DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
+DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
+
+DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
+DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
+DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
+
+#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
+#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
+#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
+
+DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
+DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
+DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
+
+DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
+DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
+DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
+
+DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
+DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
+DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
+
+DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
+DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
+DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
+
+#undef DO_XTNB
+#undef DO_XTNT
+
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
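
Expanding the first instantiation by hand shows the shape of the *B helpers; HELPER(NAME) resolves to the helper_ prefix, and MAKE_64BIT_MASK(0, 8) is 0xff (an approximate expansion, for illustration only):

    void helper_sve2_sqxtnb_h(void *vd, void *vn, uint32_t desc)
    {
        intptr_t i, opr_sz = simd_oprsz(desc);
        for (i = 0; i < opr_sz; i += sizeof(int16_t)) {
            int16_t nn = *(int16_t *)(vn + i);
            /* Clamp to [INT8_MIN, INT8_MAX], then clear the top byte. */
            nn = do_sat_bhs(nn, INT8_MIN, INT8_MAX) & 0xff;
            *(int16_t *)(vd + i) = nn;
        }
    }

The *T helpers instead store only a half-width TYPEN at byte offset odd within each full-width container (the H1/H1_2/H1_4 macros fold in the host-endianness correction), so the even half-width elements of vd pass through untouched.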
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6469,3 +6469,241 @@ static bool trans_UABA(DisasContext *s, arg_rrr_esz *a)
{
return do_sve2_fn_zzz(s, a, gen_gvec_uaba);
}
+
+static bool do_sve2_narrow_extract(DisasContext *s, arg_rri_esz *a,
+ const GVecGen2 ops[3])
+{
+ if (a->esz < 0 || a->esz > MO_32 || a->imm != 0 ||
+ !dc_isar_feature(aa64_sve2, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_2(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vsz, vsz, &ops[a->esz]);
+ }
+ return true;
+}
+
+static const TCGOpcode sqxtn_list[] = {
+ INDEX_op_shli_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
+};
+
+static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t mask = (1ull << halfbits) - 1;
+ int64_t min = -1ull << (halfbits - 1);
+ int64_t max = -min - 1;
+
+ tcg_gen_dupi_vec(vece, t, min);
+ tcg_gen_smax_vec(vece, d, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_smin_vec(vece, d, d, t);
+ tcg_gen_dupi_vec(vece, t, mask);
+ tcg_gen_and_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+static bool trans_SQXTNB(DisasContext *s, arg_rri_esz *a)
+{
+ static const GVecGen2 ops[3] = {
+ { .fniv = gen_sqxtnb_vec,
+ .opt_opc = sqxtn_list,
+ .fno = gen_helper_sve2_sqxtnb_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtnb_vec,
+ .opt_opc = sqxtn_list,
+ .fno = gen_helper_sve2_sqxtnb_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtnb_vec,
+ .opt_opc = sqxtn_list,
+ .fno = gen_helper_sve2_sqxtnb_d,
+ .vece = MO_64 },
+ };
+ return do_sve2_narrow_extract(s, a, ops);
+}
+
+static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t mask = (1ull << halfbits) - 1;
+ int64_t min = -1ull << (halfbits - 1);
+ int64_t max = -min - 1;
+
+ tcg_gen_dupi_vec(vece, t, min);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_smin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_dupi_vec(vece, t, mask);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static bool trans_SQXTNT(DisasContext *s, arg_rri_esz *a)
+{
+ static const GVecGen2 ops[3] = {
+ { .fniv = gen_sqxtnt_vec,
+ .opt_opc = sqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtnt_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtnt_vec,
+ .opt_opc = sqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtnt_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtnt_vec,
+ .opt_opc = sqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtnt_d,
+ .vece = MO_64 },
+ };
+ return do_sve2_narrow_extract(s, a, ops);
+}
+
+static const TCGOpcode uqxtn_list[] = {
+ INDEX_op_shli_vec, INDEX_op_umin_vec, 0
+};
+
+static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, d, n, t);
+ tcg_temp_free_vec(t);
+}
+
+static bool trans_UQXTNB(DisasContext *s, arg_rri_esz *a)
+{
+ static const GVecGen2 ops[3] = {
+ { .fniv = gen_uqxtnb_vec,
+ .opt_opc = uqxtn_list,
+ .fno = gen_helper_sve2_uqxtnb_h,
+ .vece = MO_16 },
+ { .fniv = gen_uqxtnb_vec,
+ .opt_opc = uqxtn_list,
+ .fno = gen_helper_sve2_uqxtnb_s,
+ .vece = MO_32 },
+ { .fniv = gen_uqxtnb_vec,
+ .opt_opc = uqxtn_list,
+ .fno = gen_helper_sve2_uqxtnb_d,
+ .vece = MO_64 },
+ };
+ return do_sve2_narrow_extract(s, a, ops);
+}
+
+static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static bool trans_UQXTNT(DisasContext *s, arg_rri_esz *a)
+{
+ static const GVecGen2 ops[3] = {
+ { .fniv = gen_uqxtnt_vec,
+ .opt_opc = uqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqxtnt_h,
+ .vece = MO_16 },
+ { .fniv = gen_uqxtnt_vec,
+ .opt_opc = uqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqxtnt_s,
+ .vece = MO_32 },
+ { .fniv = gen_uqxtnt_vec,
+ .opt_opc = uqxtn_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_uqxtnt_d,
+ .vece = MO_64 },
+ };
+ return do_sve2_narrow_extract(s, a, ops);
+}
+
+static const TCGOpcode sqxtun_list[] = {
+ INDEX_op_shli_vec, INDEX_op_umin_vec, INDEX_op_smax_vec, 0
+};
+
+static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, 0);
+ tcg_gen_smax_vec(vece, d, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, d, d, t);
+ tcg_temp_free_vec(t);
+}
+
+static bool trans_SQXTUNB(DisasContext *s, arg_rri_esz *a)
+{
+ static const GVecGen2 ops[3] = {
+ { .fniv = gen_sqxtunb_vec,
+ .opt_opc = sqxtun_list,
+ .fno = gen_helper_sve2_sqxtunb_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtunb_vec,
+ .opt_opc = sqxtun_list,
+ .fno = gen_helper_sve2_sqxtunb_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtunb_vec,
+ .opt_opc = sqxtun_list,
+ .fno = gen_helper_sve2_sqxtunb_d,
+ .vece = MO_64 },
+ };
+ return do_sve2_narrow_extract(s, a, ops);
+}
+
+static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ int halfbits = 4 << vece;
+ int64_t max = (1ull << halfbits) - 1;
+
+ tcg_gen_dupi_vec(vece, t, 0);
+ tcg_gen_smax_vec(vece, n, n, t);
+ tcg_gen_dupi_vec(vece, t, max);
+ tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_shli_vec(vece, n, n, halfbits);
+ tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_temp_free_vec(t);
+}
+
+static bool trans_SQXTUNT(DisasContext *s, arg_rri_esz *a)
+{
+ static const GVecGen2 ops[3] = {
+ { .fniv = gen_sqxtunt_vec,
+ .opt_opc = sqxtun_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtunt_h,
+ .vece = MO_16 },
+ { .fniv = gen_sqxtunt_vec,
+ .opt_opc = sqxtun_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtunt_s,
+ .vece = MO_32 },
+ { .fniv = gen_sqxtunt_vec,
+ .opt_opc = sqxtun_list,
+ .load_dest = true,
+ .fno = gen_helper_sve2_sqxtunt_d,
+ .vece = MO_64 },
+ };
+ return do_sve2_narrow_extract(s, a, ops);
+}
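
For the top-half expansions, the final tcg_gen_bitsel_vec(vece, d, t, d, n) computes, per full-width lane and with t holding the low-half mask:

    d = (d & mask) | ((saturate(n) << halfbits) & ~mask)

so the even (bottom) half-width elements of the destination survive and the odd (top) elements receive the narrowed results, which is why the *T entries set .load_dest = true while the *B entries, which fully overwrite the destination, do not. When the backend cannot provide the opcodes named in the opt_opc lists, gvec falls back to the out-of-line .fno helpers declared above.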