--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -557,6 +557,23 @@ DEF_HELPER_FLAGS_5(gvec_qrdmlah_s32, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s32, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlah_b, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_b, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlah_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlah_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlah_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(gvec_sdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_udot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_sdot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
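As a reading aid: each DEF_HELPER_FLAGS_5 entry above expands, via exec/helper-proto.h, into an out-of-line helper prototype, and TCG_CALL_NO_RWG marks the helper as neither reading nor writing TCG globals. Roughly, the first new entry declares something like the following (a sketch only; the real macros also attach call-flags metadata):

    /* Approximate expansion of DEF_HELPER_FLAGS_5(sve2_sqrdmlah_b, ...) */
    void helper_sve2_sqrdmlah_b(void *vd, void *vn, void *vm,
                                void *va, uint32_t desc);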
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1346,3 +1346,8 @@ SQDMLSLT_zzzw 01000100 .. 0 ..... 0110 11 ..... ..... @rda_rn_rm
SQDMLALBT 01000100 .. 0 ..... 00001 0 ..... ..... @rda_rn_rm
SQDMLSLBT 01000100 .. 0 ..... 00001 1 ..... ..... @rda_rn_rm
+
+## SVE2 saturating multiply-add high
+
+SQRDMLAH_zzzz 01000100 .. 0 ..... 01110 0 ..... ..... @rda_rn_rm
+SQRDMLSH_zzzz 01000100 .. 0 ..... 01110 1 ..... ..... @rda_rn_rm
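The field layout follows directly from the pattern: bits [31:24] are fixed, the element size sits at [23:22], bits [20:16] are Zm, bit 10 selects SQRDMLSH over SQRDMLAH, and [9:5]/[4:0] are Zn/Zd, with the destination register also supplying the accumulator (assuming @rda_rn_rm takes ra from the rd field, as elsewhere in sve.decode). A hand-written equivalent of the extraction decodetree generates might look like this (hypothetical sketch, not generated code):

    /* Hypothetical stand-in for the generated decode of the patterns above. */
    static void extract_sqrdmlah_zzzz(uint32_t insn, int *esz, int *rm,
                                      int *rn, int *rd, bool *is_mlsh)
    {
        *esz     = (insn >> 22) & 3;   /* 0=b, 1=h, 2=s, 3=d */
        *rm      = (insn >> 16) & 31;
        *is_mlsh = (insn >> 10) & 1;   /* 01110 S: S=1 is SQRDMLSH */
        *rn      = (insn >> 5) & 31;
        *rd      = insn & 31;          /* also the accumulator (Zda) */
    }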
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -7123,3 +7123,21 @@ static bool trans_SQDMLSLBT(DisasContext *s, arg_rrrr_esz *a)
{
return do_sqdmlsl_zzzw(s, a, false, true);
}
+
+static bool trans_SQRDMLAH_zzzz(DisasContext *s, arg_rrrr_esz *a)
+{
+ static gen_helper_gvec_4 * const fns[] = {
+ gen_helper_sve2_sqrdmlah_b, gen_helper_sve2_sqrdmlah_h,
+ gen_helper_sve2_sqrdmlah_s, gen_helper_sve2_sqrdmlah_d,
+ };
+ return do_sve2_zzzz_ool(s, a, fns[a->esz], 0);
+}
+
+static bool trans_SQRDMLSH_zzzz(DisasContext *s, arg_rrrr_esz *a)
+{
+ static gen_helper_gvec_4 * const fns[] = {
+ gen_helper_sve2_sqrdmlsh_b, gen_helper_sve2_sqrdmlsh_h,
+ gen_helper_sve2_sqrdmlsh_s, gen_helper_sve2_sqrdmlsh_d,
+ };
+ return do_sve2_zzzz_ool(s, a, fns[a->esz], 0);
+}
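Both trans functions index the helper table by a->esz (0 through 3, matching the b/h/s/d order of fns[]) and defer to do_sve2_zzzz_ool from earlier in this series. For context, that helper is expected to have roughly this shape (a sketch, not part of this patch): gate on the SVE2 feature bit, then emit an out-of-line gvec call over the four vector registers.

    /* Sketch of the existing do_sve2_zzzz_ool, for context. */
    static bool do_sve2_zzzz_ool(DisasContext *s, arg_rrrr_esz *a,
                                 gen_helper_gvec_4 *fn, int data)
    {
        if (fn == NULL || !dc_isar_feature(aa64_sve2, s)) {
            return false;
        }
        if (sve_access_check(s)) {
            gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, data);
        }
        return true;
    }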
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -22,6 +22,7 @@
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
+#include "qemu/int128.h"
#include "vec_internal.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
@@ -36,15 +37,55 @@
#define H4(x) (x)
#endif
+/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
+static int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
+ bool neg, bool round)
+{
+ /*
+ * Simplify:
+ * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
+ * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
+ */
+ int32_t ret = (int32_t)src1 * src2;
+ if (neg) {
+ ret = -ret;
+ }
+ ret += ((int32_t)src3 << 7) + (round << 6);
+ ret >>= 7;
+
+ if (ret != (int8_t)ret) {
+ ret = (ret < 0 ? INT8_MIN : INT8_MAX);
+ }
+ return ret;
+}
+
+void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
+ }
+}
+
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
bool neg, bool round, uint32_t *sat)
{
- /*
- * Simplify:
- * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
- * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
- */
+ /* Simplify similarly to do_sqrdmlah_b above. */
int32_t ret = (int32_t)src1 * src2;
if (neg) {
ret = -ret;
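The two-line "Simplify" comment in do_sqrdmlah_b is an exact transformation: every term in the first expression is even (the product is explicitly doubled, and the accumulator and rounding terms carry factors of two), so the numerator and the shift can both be halved without changing the result. A standalone harness checking the corner cases of the byte arithmetic (hypothetical test, not part of the patch):

    /* Mirror of do_sqrdmlah_b for host-side sanity checking. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static int8_t ref_sqrdmlah_b(int8_t n, int8_t m, int8_t a,
                                 bool neg, bool round)
    {
        int32_t ret = (int32_t)n * m;
        if (neg) {
            ret = -ret;
        }
        ret += ((int32_t)a << 7) + (round << 6);
        ret >>= 7;
        return ret != (int8_t)ret ? (ret < 0 ? INT8_MIN : INT8_MAX) : ret;
    }

    int main(void)
    {
        /* -1.0 * -1.0 = +1.0 is unrepresentable in Q0.7: saturate. */
        assert(ref_sqrdmlah_b(INT8_MIN, INT8_MIN, 0, false, true) == INT8_MAX);
        /* 0.5 * 0.5 = 0.25, i.e. 0x40 * 0x40 -> 0x20. */
        assert(ref_sqrdmlah_b(0x40, 0x40, 0, false, true) == 0x20);
        /* The accumulator adds in directly: 0x20 + 0x10 = 0x30. */
        assert(ref_sqrdmlah_b(0x40, 0x40, 0x10, false, true) == 0x30);
        return 0;
    }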
@@ -109,16 +150,40 @@ void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
+ }
+}
+
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
static int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
bool neg, bool round, uint32_t *sat)
{
- /* Simplify similarly to int_qrdmlah_s16 above. */
+ /* Simplify similarly to do_sqrdmlah_b above. */
int64_t ret = (int64_t)src1 * src2;
if (neg) {
ret = -ret;
}
- ret = ((int64_t)src3 << 31) + (round << 30);
+ ret += ((int64_t)src3 << 31) + (round << 30);
ret >>= 31;
if (ret != (int32_t)ret) {
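The `discard` locals above are the substance of the commit message: the existing NEON entry points thread a pointer to the QC flag into do_sqrdmlah_h/do_sqrdmlah_s so that saturation becomes sticky, while SVE2 has no such flag and the new helpers simply throw the indication away. For contrast, the NEON 16-bit helper has approximately this shape (a sketch of existing code, not part of this patch); note that it also accumulates into d[] rather than a separate va operand, and that it clears the tail of the vector, which the SVE2 helpers never need since they operate on the full length given by the descriptor:

    /* Approximate shape of the existing NEON entry point: vq points at
     * the QC flag, and saturation is reported through it. */
    void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                                  void *vq, uint32_t desc)
    {
        intptr_t i, opr_sz = simd_oprsz(desc);
        int16_t *d = vd, *n = vn, *m = vm;

        for (i = 0; i < opr_sz / 2; ++i) {
            d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
        }
        clear_tail(d, opr_sz, simd_maxsz(desc));
    }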
@@ -172,6 +237,90 @@ void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
+ }
+}
+
+/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
+static int64_t do_sat128_d(Int128 r)
+{
+ int64_t ls = int128_getlo(r);
+ int64_t hs = int128_gethi(r);
+
+ if (unlikely(hs != (ls >> 63))) {
+ return hs < 0 ? INT64_MIN : INT64_MAX;
+ }
+ return ls;
+}
+
+static int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a,
+ bool neg, bool round)
+{
+ uint64_t l, h;
+ Int128 r, t;
+
+ /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
+ muls64(&l, &h, m, n);
+ r = int128_make128(l, h);
+ if (neg) {
+ r = int128_neg(r);
+ }
+ if (a) {
+ t = int128_exts64(a);
+ t = int128_lshift(t, 63);
+ r = int128_add(r, t);
+ }
+ if (round) {
+ t = int128_exts64(1ll << 62);
+ r = int128_add(r, t);
+ }
+ r = int128_rshift(r, 63);
+
+ return do_sat128_d(r);
+}
+
+void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
+ }
+}
+
/* Integer 8 and 16-bit dot-product.
*
* Note that for the loops herein, host endianness does not matter
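For the 64-bit element size there is no wider C integer type to fall back on, hence muls64() for the signed 64x64->128 multiply and the Int128 helpers for the accumulate, round, and shift, with do_sat128_d saturating any result whose high half no longer matches the sign extension of the low half. On hosts with a native 128-bit type the whole computation collapses to a few lines, which makes for a convenient cross-check (hypothetical harness, not part of the patch):

    /* __int128 reference for do_sqrdmlah_d, for hosts that have it. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static int64_t ref_sqrdmlah_d(int64_t n, int64_t m, int64_t a,
                                  bool neg, bool round)
    {
        __int128 r = (__int128)n * m;
        if (neg) {
            r = -r;
        }
        r += ((__int128)a << 63) + ((__int128)round << 62);
        r >>= 63;
        if (r > INT64_MAX) {
            return INT64_MAX;
        }
        if (r < INT64_MIN) {
            return INT64_MIN;
        }
        return r;
    }

    int main(void)
    {
        /* -1.0 * -1.0 overflows Q0.63 and must saturate. */
        assert(ref_sqrdmlah_d(INT64_MIN, INT64_MIN, 0, false, true)
               == INT64_MAX);
        /* 0.5 * 0.5 = 0.25, plus the accumulator. */
        assert(ref_sqrdmlah_d(1ll << 62, 1ll << 62, 5, false, true)
               == (1ll << 61) + 5);
        return 0;
    }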
SVE2 has two additional sizes of the operation and unlike NEON,
there is no saturation flag.  Create new entry points for SVE2
that do not set QC.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper.h        |  17 ++++
 target/arm/sve.decode      |   5 ++
 target/arm/translate-sve.c |  18 ++++
 target/arm/vec_helper.c    | 163 +++++++++++++++++++++++++++++++++++--
 4 files changed, 196 insertions(+), 7 deletions(-)

--
2.25.1