@@ -2126,27 +2126,24 @@ static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
*/
static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
- uint32_t desc, bool fz16)
+ uint64_t negx, int negf, uint32_t desc, bool fz16)
{
intptr_t i, oprsz = simd_oprsz(desc);
- int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
int is_q = oprsz == 16;
uint64_t n_4, m_4;
- /* Pre-load all of the f16 data, avoiding overlap issues. */
- n_4 = load4_f16(vn, is_q, is_2);
+ /*
+ * Pre-load all of the f16 data, avoiding overlap issues.
+ * Negate all inputs for AH=0 FMLSL at once.
+ */
+ n_4 = load4_f16(vn, is_q, is_2) ^ negx;
m_4 = load4_f16(vm, is_q, is_2);
- /* Negate all inputs for FMLSL at once. */
- if (is_s) {
- n_4 ^= 0x8000800080008000ull;
- }
-
for (i = 0; i < oprsz / 4; i++) {
float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
- d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+ d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
}
clear_tail(d, oprsz, simd_maxsz(desc));
}
@@ -2154,14 +2151,28 @@ static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
CPUARMState *env, uint32_t desc)
{
- do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], desc,
+ bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t negx = is_s ? 0x8000800080008000ull : 0;
+
+ do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
}
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
CPUARMState *env, uint32_t desc)
{
- do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_A64], desc,
+ bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t negx = 0;
+ int negf = 0;
+
+ if (is_s) {
+ if (env->vfp.fpcr & FPCR_AH) {
+ negf = float_muladd_negate_product;
+ } else {
+ negx = 0x8000800080008000ull;
+ }
+ }
+ do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
}
@@ -2186,29 +2197,25 @@ void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
}
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
- uint32_t desc, bool fz16)
+ uint64_t negx, int negf, uint32_t desc, bool fz16)
{
intptr_t i, oprsz = simd_oprsz(desc);
- int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
int is_q = oprsz == 16;
uint64_t n_4;
float32 m_1;
- /* Pre-load all of the f16 data, avoiding overlap issues. */
- n_4 = load4_f16(vn, is_q, is_2);
-
- /* Negate all inputs for FMLSL at once. */
- if (is_s) {
- n_4 ^= 0x8000800080008000ull;
- }
-
+ /*
+ * Pre-load all of the f16 data, avoiding overlap issues.
+ * Negate all inputs for AH=0 FMLSL at once.
+ */
+ n_4 = load4_f16(vn, is_q, is_2) ^ negx;
m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
for (i = 0; i < oprsz / 4; i++) {
float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
- d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+ d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
}
clear_tail(d, oprsz, simd_maxsz(desc));
}
@@ -2216,14 +2223,28 @@ static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
CPUARMState *env, uint32_t desc)
{
- do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], desc,
+ bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t negx = is_s ? 0x8000800080008000ull : 0;
+
+ do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
}
void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
CPUARMState *env, uint32_t desc)
{
- do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_A64], desc,
+ bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t negx = 0;
+ int negf = 0;
+
+ if (is_s) {
+ if (env->vfp.fpcr & FPCR_AH) {
+ negf = float_muladd_negate_product;
+ } else {
+ negx = 0x8000800080008000ull;
+ }
+ }
+ do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
}
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_helper.c | 71 ++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 25 deletions(-)