@@ -1574,3 +1574,5 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -342,7 +342,6 @@ DEF_HELPER_2(neon_sub_u8, i32, i32, i32)
DEF_HELPER_2(neon_sub_u16, i32, i32, i32)
DEF_HELPER_2(neon_mul_u8, i32, i32, i32)
DEF_HELPER_2(neon_mul_u16, i32, i32, i32)
-DEF_HELPER_2(neon_mull_p8, i64, i32, i32)
DEF_HELPER_2(neon_tst_u8, i32, i32, i32)
DEF_HELPER_2(neon_tst_u16, i32, i32, i32)
@@ -695,6 +694,8 @@ DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
@@ -1129,38 +1129,6 @@ NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN
-/* Polynomial multiplication is like integer multiplication except the
- partial products are XORed, not added. */
-uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
-{
- uint64_t result = 0;
- uint64_t mask;
- uint64_t op2ex = op2;
- op2ex = (op2ex & 0xff) |
- ((op2ex & 0xff00) << 8) |
- ((op2ex & 0xff0000) << 16) |
- ((op2ex & 0xff000000) << 24);
- while (op1) {
- mask = 0;
- if (op1 & 1) {
- mask |= 0xffff;
- }
- if (op1 & (1 << 8)) {
- mask |= (0xffffU << 16);
- }
- if (op1 & (1 << 16)) {
- mask |= (0xffffULL << 32);
- }
- if (op1 & (1 << 24)) {
- mask |= (0xffffULL << 48);
- }
- result ^= op2ex & mask;
- op1 = (op1 >> 1) & 0x7f7f7f7f;
- op2ex <<= 1;
- }
- return result;
-}
-
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
@@ -10533,10 +10533,6 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
tcg_passres, tcg_passres);
break;
- case 14: /* PMULL */
- assert(size == 0);
- gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2);
- break;
default:
g_assert_not_reached();
}
@@ -10700,11 +10696,21 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
break;
case 14: /* PMULL, PMULL2 */
- if (is_u || size == 1 || size == 2) {
+ if (is_u) {
unallocated_encoding(s);
return;
}
- if (size == 3) {
+ switch (size) {
+ case 0: /* PMULL.P8 */
+ if (!fp_access_check(s)) {
+ return;
+ }
+ /* The Q field specifies lo/hi half input for this insn. */
+ gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
+ gen_helper_neon_pmull_h);
+ break;
+
+ case 3: /* PMULL.P64 */
if (!dc_isar_feature(aa64_pmull, s)) {
unallocated_encoding(s);
return;
@@ -10715,9 +10721,13 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
/* The Q field specifies lo/hi half input for this insn. */
gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
gen_helper_gvec_pmull_q);
- return;
+ break;
+
+ default:
+ unallocated_encoding(s);
+ break;
}
- goto is_widening;
+ return;
case 9: /* SQDMLAL, SQDMLAL2 */
case 11: /* SQDMLSL, SQDMLSL2 */
case 13: /* SQDMULL, SQDMULL2 */
@@ -10738,7 +10748,6 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
unallocated_encoding(s);
return;
}
- is_widening:
if (!fp_access_check(s)) {
return;
}
@@ -5866,15 +5866,20 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
return 1;
}
- /* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply)
- * outside the loop below as it only performs a single pass.
- */
- if (op == 14 && size == 2) {
- if (!dc_isar_feature(aa32_pmull, s)) {
- return 1;
+ /* Handle polynomial VMULL in a single pass. */
+ if (op == 14) {
+ if (size == 0) {
+ /* VMULL.P8 */
+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
+ 0, gen_helper_neon_pmull_h);
+ } else {
+ /* VMULL.P64 */
+ if (!dc_isar_feature(aa32_pmull, s)) {
+ return 1;
+ }
+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
+ 0, gen_helper_gvec_pmull_q);
}
- tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
- 0, gen_helper_gvec_pmull_q);
return 0;
}
@@ -5952,11 +5957,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
/* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
break;
- case 14: /* Polynomial VMULL */
- gen_helper_neon_mull_p8(cpu_V0, tmp, tmp2);
- tcg_temp_free_i32(tmp2);
- tcg_temp_free_i32(tmp);
- break;
default: /* 15 is RESERVED: caught earlier */
abort();
}
@@ -1197,3 +1197,63 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+
+/*
+ * 8x8->16 polynomial multiply.
+ *
+ * The byte inputs are expanded to (or extracted from) half-words.
+ * Note that neon and sve2 get the inputs from different positions.
+ * This allows 4 bytes to be processed in parallel with uint64_t.
+ */
+
+static uint64_t expand_byte_to_half(uint64_t x)
+{
+ return (x & 0x000000ff)
+ | ((x & 0x0000ff00) << 8)
+ | ((x & 0x00ff0000) << 16)
+ | ((x & 0xff000000) << 24);
+}
+
+static uint64_t pmull_h(uint64_t op1, uint64_t op2)
+{
+ uint64_t result = 0;
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
+ result ^= op2 & mask;
+ op1 >>= 1;
+ op2 <<= 1;
+ }
+ return result;
+}
+
+void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ int hi = simd_data(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint64_t nn = n[hi], mm = m[hi];
+
+ d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+ nn >>= 32;
+ mm >>= 32;
+ d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+
+ clear_tail(d, 16, simd_maxsz(desc));
+}
+
+#ifdef TARGET_AARCH64
+void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ int shift = simd_data(desc) * 8;
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
+ uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
+
+ d[i] = pmull_h(nn, mm);
+ }
+}
+#endif