| Message ID | 20200925152047.709901-7-richard.henderson@linaro.org |
| --- | --- |
| State | New |
| Series | softfloat: Implement float128_muladd |
Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/fpu/softfloat.h |   2 +
>  fpu/softfloat.c         | 416 +++++++++++++++++++++++++++++++++++++++-
>  tests/fp/fp-test.c      |   2 +-
>  tests/fp/wrap.c.inc     |  12 ++
>  4 files changed, 430 insertions(+), 2 deletions(-)
>
[...]
>
> +static void float128_unpack(FloatParts128 *p, float128 a, float_status *status)
> +{
> +    p->sign = extractFloat128Sign(a);
> +    p->exp = extractFloat128Exp(a);
> +    p->frac0 = extractFloat128Frac0(a);
> +    p->frac1 = extractFloat128Frac1(a);

Here we are deviating from the existing style; it would be nice if we
could separate the raw unpack and have something like:

  static const FloatFmt float128_params = {
      FLOAT_PARAMS(15, 112)
  }

  static inline FloatParts128 unpack128_raw(FloatFmt fmt, uint128_t raw)
  {
      const int sign_pos = fmt.frac_size + fmt.exp_size;

      return (FloatParts128) {
          .cls = float_class_unclassified,
          .sign = extract128(raw, sign_pos, 1),
          .exp = extract128(raw, fmt.frac_size, fmt.exp_size),
          .frac1 = extract128_lo(raw, 0, fmt.frac_size),
          .frac2 = extract128_hi(raw, 0, fmt.frac_size),
      };
  }

So even if we end up duplicating a chunk of the code, the form will be
similar, so when we put the logic side by side we can see it works the
same way.

> +
> +    if (p->exp == 0) {
> +        if ((p->frac0 | p->frac1) == 0) {
> +            p->cls = float_class_zero;
> +        } else if (status->flush_inputs_to_zero) {
> +            float_raise(float_flag_input_denormal, status);
> +            p->cls = float_class_zero;
> +            p->frac0 = p->frac1 = 0;
> +        } else {
> +            normalizeFloat128Subnormal(p->frac0, p->frac1, &p->exp,
> +                                       &p->frac0, &p->frac1);
> +            p->exp -= 0x3fff;
> +            p->cls = float_class_normal;
> +        }

and we can also avoid introducing magic numbers into the code,

> +    } else if (p->exp == 0x7fff) {
> +        if ((p->frac0 | p->frac1) == 0) {
> +            p->cls = float_class_inf;
> +        } else if (float128_is_signaling_nan(a, status)) {
> +            p->cls = float_class_snan;
> +        } else {
> +            p->cls = float_class_qnan;
> +        }
> +    } else {
> +        /* Add the implicit bit. */
> +        p->frac0 |= UINT64_C(0x0001000000000000);
> +        p->exp -= 0x3fff;
> +        p->cls = float_class_normal;
> +    }
> +}

and eventually hold out for compilers smart enough to handle
unification at a later date.

[...]

-- 
Alex Bennée
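For illustration only, here is a minimal, self-contained sketch of the parameterised raw unpack Alex suggests, written against two 64-bit words since portable code cannot assume a uint128_t type. The names FloatFmt128, Parts128Raw, extract_bits and unpack128_raw are invented for this sketch and are not QEMU API; the field widths (15-bit exponent, 112-bit fraction split 48 + 64) are those of IEEE binary128.

    #include <stdint.h>
    #include <stdbool.h>

    /* Illustrative only: a parameterised description of the binary128 layout. */
    typedef struct {
        int exp_size;    /* 15 for float128 */
        int frac_size;   /* 112 for float128, split 48 + 64 across two words */
    } FloatFmt128;

    typedef struct {
        uint64_t frac0, frac1;   /* high and low fraction words */
        int32_t exp;
        bool sign;
    } Parts128Raw;

    /* Extract `len' bits starting at `pos' from a 64-bit word. */
    static inline uint64_t extract_bits(uint64_t v, int pos, int len)
    {
        return (v >> pos) & (~0ull >> (64 - len));
    }

    /* Raw unpack of the high/low words of an IEEE binary128 value. */
    static Parts128Raw unpack128_raw(FloatFmt128 fmt, uint64_t hi, uint64_t lo)
    {
        int frac_hi_bits = fmt.frac_size - 64;   /* 48 fraction bits live in `hi' */

        return (Parts128Raw) {
            .sign  = extract_bits(hi, frac_hi_bits + fmt.exp_size, 1),
            .exp   = extract_bits(hi, frac_hi_bits, fmt.exp_size),
            .frac0 = extract_bits(hi, 0, frac_hi_bits),
            .frac1 = lo,
        };
    }

Called as unpack128_raw((FloatFmt128){ .exp_size = 15, .frac_size = 112 }, high_word, low_word), it yields the same raw fields that float128_unpack extracts by hand in the patch.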
On 10/16/20 9:31 AM, Alex Bennée wrote:
>> +static void float128_unpack(FloatParts128 *p, float128 a, float_status *status)
>> +{
>> +    p->sign = extractFloat128Sign(a);
>> +    p->exp = extractFloat128Exp(a);
>> +    p->frac0 = extractFloat128Frac0(a);
>> +    p->frac1 = extractFloat128Frac1(a);
>
> Here we are deviating from the existing style; it would be nice if we
> could separate the raw unpack and have something like:
>
> [...]
>
> So even if we end up duplicating a chunk of the code, the form will be
> similar, so when we put the logic side by side we can see it works the
> same way.

I suppose, but unlike the smaller fp formats, we won't be able to reuse
this.  Even if we pull in the x86 80-bit format and the m68k 96-bit
format, there are a number of fundamental differences.  E.g. the
implicit bit

>> +        /* Add the implicit bit. */
>> +        p->frac0 |= UINT64_C(0x0001000000000000);

is not present in the x86 and m68k formats.

Finally, I'm continuing to use the existing Berkeley packing logic,
which is a bit persnickety about where that implicit bit goes.  Our
smaller formats put the implicit bit at bit 62.

r~
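For readers comparing the two packings, the bit positions Richard refers to can be checked directly. The two constants below are taken from the existing FloatParts definitions and from this patch; FLOAT128_IMPLICIT_BIT is a name invented here, and the small test program around them is only an illustration:

    #include <assert.h>
    #include <stdint.h>

    /* Existing 64-bit decomposed format: implicit bit at bit 62 (64 - 2). */
    #define DECOMPOSED_BINARY_POINT  (64 - 2)
    #define DECOMPOSED_IMPLICIT_BIT  (1ull << DECOMPOSED_BINARY_POINT)

    /*
     * Berkeley float128 packing: the 112-bit fraction is split 48 + 64 across
     * frac0/frac1, so the implicit integer bit lands at bit 48 of frac0,
     * i.e. the UINT64_C(0x0001000000000000) OR'ed in by float128_unpack().
     */
    #define FLOAT128_IMPLICIT_BIT  UINT64_C(0x0001000000000000)

    int main(void)
    {
        assert(DECOMPOSED_IMPLICIT_BIT == (1ull << 62));
        assert(FLOAT128_IMPLICIT_BIT == (1ull << 48));
        return 0;
    }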
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 78ad5ca738..a38433deb4 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -1196,6 +1196,8 @@ float128 float128_sub(float128, float128, float_status *status);
 float128 float128_mul(float128, float128, float_status *status);
 float128 float128_div(float128, float128, float_status *status);
 float128 float128_rem(float128, float128, float_status *status);
+float128 float128_muladd(float128, float128, float128, int,
+                         float_status *status);
 float128 float128_sqrt(float128, float_status *status);
 FloatRelation float128_compare(float128, float128, float_status *status);
 FloatRelation float128_compare_quiet(float128, float128, float_status *status);
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e038434a07..49de31fec2 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -512,11 +512,19 @@ static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 
 typedef struct {
     uint64_t frac;
-    int32_t exp;
+    int32_t exp;
     FloatClass cls;
     bool sign;
 } FloatParts;
 
+/* Similar for float128. */
+typedef struct {
+    uint64_t frac0, frac1;
+    int32_t exp;
+    FloatClass cls;
+    bool sign;
+} FloatParts128;
+
 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
@@ -4574,6 +4582,46 @@ static void
 
 }
 
+/*----------------------------------------------------------------------------
+| Returns the parts of floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static void float128_unpack(FloatParts128 *p, float128 a, float_status *status)
+{
+    p->sign = extractFloat128Sign(a);
+    p->exp = extractFloat128Exp(a);
+    p->frac0 = extractFloat128Frac0(a);
+    p->frac1 = extractFloat128Frac1(a);
+
+    if (p->exp == 0) {
+        if ((p->frac0 | p->frac1) == 0) {
+            p->cls = float_class_zero;
+        } else if (status->flush_inputs_to_zero) {
+            float_raise(float_flag_input_denormal, status);
+            p->cls = float_class_zero;
+            p->frac0 = p->frac1 = 0;
+        } else {
+            normalizeFloat128Subnormal(p->frac0, p->frac1, &p->exp,
+                                       &p->frac0, &p->frac1);
+            p->exp -= 0x3fff;
+            p->cls = float_class_normal;
+        }
+    } else if (p->exp == 0x7fff) {
+        if ((p->frac0 | p->frac1) == 0) {
+            p->cls = float_class_inf;
+        } else if (float128_is_signaling_nan(a, status)) {
+            p->cls = float_class_snan;
+        } else {
+            p->cls = float_class_qnan;
+        }
+    } else {
+        /* Add the implicit bit. */
+        p->frac0 |= UINT64_C(0x0001000000000000);
+        p->exp -= 0x3fff;
+        p->cls = float_class_normal;
+    }
+}
+
 /*----------------------------------------------------------------------------
 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
@@ -7205,6 +7253,372 @@ float128 float128_mul(float128 a, float128 b, float_status *status)
 
 }
 
+typedef struct UInt256 {
+    /* Indexed big-endian, to match the rest of softfloat numbering. */
+    uint64_t w[4];
+} UInt256;
+
+static inline uint64_t shl_double(uint64_t h, uint64_t l, unsigned lsh)
+{
+    return lsh ? (h << lsh) | (l >> (64 - lsh)) : h;
+}
+
+static inline uint64_t shr_double(uint64_t h, uint64_t l, unsigned rsh)
+{
+    return rsh ? (h << (64 - rsh)) | (l >> rsh) : l;
+}
+
+static void shortShift256Left(UInt256 *p, unsigned lsh)
+{
+    if (lsh != 0) {
+        p->w[0] = shl_double(p->w[0], p->w[1], lsh);
+        p->w[1] = shl_double(p->w[1], p->w[2], lsh);
+        p->w[2] = shl_double(p->w[2], p->w[3], lsh);
+        p->w[3] <<= lsh;
+    }
+}
+
+static inline void shift256RightJamming(UInt256 *p, unsigned count)
+{
+    uint64_t out, p0, p1, p2, p3;
+
+    p0 = p->w[0];
+    p1 = p->w[1];
+    p2 = p->w[2];
+    p3 = p->w[3];
+
+    if (unlikely(count == 0)) {
+        return;
+    } else if (likely(count < 64)) {
+        out = 0;
+    } else if (likely(count < 256)) {
+        if (count < 128) {
+            out = p3;
+            p3 = p2;
+            p2 = p1;
+            p1 = p0;
+            p0 = 0;
+        } else if (count < 192) {
+            out = p2 | p3;
+            p3 = p1;
+            p2 = p0;
+            p1 = 0;
+            p0 = 0;
+        } else {
+            out = p1 | p2 | p3;
+            p3 = p0;
+            p2 = 0;
+            p1 = 0;
+            p0 = 0;
+        }
+        count &= 63;
+        if (count == 0) {
+            goto done;
+        }
+    } else {
+        out = p0 | p1 | p2 | p3;
+        p3 = 0;
+        p2 = 0;
+        p1 = 0;
+        p0 = 0;
+        goto done;
+    }
+
+    out |= shr_double(p3, 0, count);
+    p3 = shr_double(p2, p3, count);
+    p2 = shr_double(p1, p2, count);
+    p1 = shr_double(p0, p1, count);
+    p0 = p0 >> count;
+
+ done:
+    p->w[3] = p3 | (out != 0);
+    p->w[2] = p2;
+    p->w[1] = p1;
+    p->w[0] = p0;
+}
+
+/* R = A - B */
+static void sub256(UInt256 *r, UInt256 *a, UInt256 *b)
+{
+    bool borrow = false;
+
+    for (int i = 3; i >= 0; --i) {
+        uint64_t at = a->w[i];
+        uint64_t bt = b->w[i];
+        uint64_t rt = at - bt;
+
+        if (borrow) {
+            borrow = at <= bt;
+            rt -= 1;
+        } else {
+            borrow = at < bt;
+        }
+        r->w[i] = rt;
+    }
+}
+
+/* A = -A */
+static void neg256(UInt256 *a)
+{
+    /*
+     * Recall that -X - 1 = ~X, and that since this is negation,
+     * once we find a non-zero number, all subsequent words will
+     * have borrow-in, and thus use NOT.
+     */
+    uint64_t t = a->w[3];
+    if (likely(t)) {
+        a->w[3] = -t;
+        goto not2;
+    }
+    t = a->w[2];
+    if (likely(t)) {
+        a->w[2] = -t;
+        goto not1;
+    }
+    t = a->w[1];
+    if (likely(t)) {
+        a->w[1] = -t;
+        goto not0;
+    }
+    a->w[0] = -a->w[0];
+    return;
+ not2:
+    a->w[2] = ~a->w[2];
+ not1:
+    a->w[1] = ~a->w[1];
+ not0:
+    a->w[0] = ~a->w[0];
+}
+
+/* A += B */
+static void add256(UInt256 *a, UInt256 *b)
+{
+    bool carry = false;
+
+    for (int i = 3; i >= 0; --i) {
+        uint64_t bt = b->w[i];
+        uint64_t at = a->w[i] + bt;
+
+        if (carry) {
+            at += 1;
+            carry = at <= bt;
+        } else {
+            carry = at < bt;
+        }
+        a->w[i] = at;
+    }
+}
+
+float128 float128_muladd(float128 a_f, float128 b_f, float128 c_f,
+                         int flags, float_status *status)
+{
+    bool inf_zero, p_sign, sign_flip;
+    int p_exp, exp_diff, shift, ab_mask, abc_mask;
+    FloatParts128 a, b, c;
+    FloatClass p_cls;
+    UInt256 p_frac, c_frac;
+
+    float128_unpack(&a, a_f, status);
+    float128_unpack(&b, b_f, status);
+    float128_unpack(&c, c_f, status);
+
+    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
+    abc_mask = float_cmask(c.cls) | ab_mask;
+    inf_zero = ab_mask == float_cmask_infzero;
+
+    /* If any input is a NaN, select the required result. */
+    if (unlikely(abc_mask & float_cmask_anynan)) {
+        if (unlikely(abc_mask & float_cmask_snan)) {
+            float_raise(float_flag_invalid, status);
+        }
+
+        int which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, status);
+        if (status->default_nan_mode) {
+            which = 3;
+        }
+        switch (which) {
+        case 0:
+            break;
+        case 1:
+            a_f = b_f;
+            a.cls = b.cls;
+            break;
+        case 2:
+            a_f = c_f;
+            a.cls = c.cls;
+            break;
+        case 3:
+            return float128_default_nan(status);
+        }
+        if (is_snan(a.cls)) {
+            return float128_silence_nan(a_f, status);
+        }
+        return a_f;
+    }
+
+    /* After dealing with input NaNs, look for Inf * Zero. */
+    if (unlikely(inf_zero)) {
+        float_raise(float_flag_invalid, status);
+        return float128_default_nan(status);
+    }
+
+    p_sign = a.sign ^ b.sign;
+
+    if (flags & float_muladd_negate_c) {
+        c.sign ^= 1;
+    }
+    if (flags & float_muladd_negate_product) {
+        p_sign ^= 1;
+    }
+    sign_flip = (flags & float_muladd_negate_result);
+
+    if (ab_mask & float_cmask_inf) {
+        p_cls = float_class_inf;
+    } else if (ab_mask & float_cmask_zero) {
+        p_cls = float_class_zero;
+    } else {
+        p_cls = float_class_normal;
+    }
+
+    if (c.cls == float_class_inf) {
+        if (p_cls == float_class_inf && p_sign != c.sign) {
+            /* +Inf + -Inf = NaN */
+            float_raise(float_flag_invalid, status);
+            return float128_default_nan(status);
+        }
+        /* Inf + Inf = Inf of the proper sign; reuse the return below. */
+        p_cls = float_class_inf;
+        p_sign = c.sign;
+    }
+
+    if (p_cls == float_class_inf) {
+        return packFloat128(p_sign ^ sign_flip, 0x7fff, 0, 0);
+    }
+
+    if (p_cls == float_class_zero) {
+        if (c.cls == float_class_zero) {
+            if (p_sign != c.sign) {
+                p_sign = status->float_rounding_mode == float_round_down;
+            }
+            return packFloat128(p_sign ^ sign_flip, 0, 0, 0);
+        }
+
+        if (flags & float_muladd_halve_result) {
+            c.exp -= 1;
+        }
+        return roundAndPackFloat128(c.sign ^ sign_flip,
+                                    c.exp + 0x3fff - 1,
+                                    c.frac0, c.frac1, 0, status);
+    }
+
+    /* a & b should be normals now... */
+    assert(a.cls == float_class_normal && b.cls == float_class_normal);
+
+    /* Multiply of 2 113-bit numbers produces a 226-bit result. */
+    mul128To256(a.frac0, a.frac1, b.frac0, b.frac1,
+                &p_frac.w[0], &p_frac.w[1], &p_frac.w[2], &p_frac.w[3]);
+
+    /* Realign the binary point at bit 48 of p_frac[0]. */
+    shift = clz64(p_frac.w[0]) - 15;
+    shortShift256Left(&p_frac, shift);
+    p_exp = a.exp + b.exp - (shift - 16);
+    exp_diff = p_exp - c.exp;
+
+    /* Extend the fraction part of the addend to 256 bits. */
+    c_frac.w[0] = c.frac0;
+    c_frac.w[1] = c.frac1;
+    c_frac.w[2] = 0;
+    c_frac.w[3] = 0;
+
+    /* Add or subtract C from the intermediate product. */
+    if (c.cls == float_class_zero) {
+        /* Fall through to rounding after addition (with zero). */
+    } else if (p_sign != c.sign) {
+        /* Subtraction */
+        if (exp_diff < 0) {
+            shift256RightJamming(&p_frac, -exp_diff);
+            sub256(&p_frac, &c_frac, &p_frac);
+            p_exp = c.exp;
+            p_sign ^= 1;
+        } else if (exp_diff > 0) {
+            shift256RightJamming(&c_frac, exp_diff);
+            sub256(&p_frac, &p_frac, &c_frac);
+        } else {
+            /* Low 128 bits of C are known to be zero. */
+            sub128(p_frac.w[0], p_frac.w[1], c_frac.w[0], c_frac.w[1],
+                   &p_frac.w[0], &p_frac.w[1]);
+            /*
+             * Since we have normalized to bit 48 of p_frac[0],
+             * a negative result means C > P and we need to invert.
+             */
+            if ((int64_t)p_frac.w[0] < 0) {
+                neg256(&p_frac);
+                p_sign ^= 1;
+            }
+        }
+
+        /*
+         * Gross normalization of the 256-bit subtraction result.
+         * Fine tuning below shared with addition.
+         */
+        if (p_frac.w[0] != 0) {
+            /* nothing to do */
+        } else if (p_frac.w[1] != 0) {
+            p_exp -= 64;
+            p_frac.w[0] = p_frac.w[1];
+            p_frac.w[1] = p_frac.w[2];
+            p_frac.w[2] = p_frac.w[3];
+            p_frac.w[3] = 0;
+        } else if (p_frac.w[2] != 0) {
+            p_exp -= 128;
+            p_frac.w[0] = p_frac.w[2];
+            p_frac.w[1] = p_frac.w[3];
+            p_frac.w[2] = 0;
+            p_frac.w[3] = 0;
+        } else if (p_frac.w[3] != 0) {
+            p_exp -= 192;
+            p_frac.w[0] = p_frac.w[3];
+            p_frac.w[1] = 0;
+            p_frac.w[2] = 0;
+            p_frac.w[3] = 0;
+        } else {
+            /* Subtraction was exact: result is zero. */
+            p_sign = status->float_rounding_mode == float_round_down;
+            return packFloat128(p_sign ^ sign_flip, 0, 0, 0);
+        }
+    } else {
+        /* Addition */
+        if (exp_diff <= 0) {
+            shift256RightJamming(&p_frac, -exp_diff);
+            /* Low 128 bits of C are known to be zero. */
+            add128(p_frac.w[0], p_frac.w[1], c_frac.w[0], c_frac.w[1],
+                   &p_frac.w[0], &p_frac.w[1]);
+            p_exp = c.exp;
+        } else {
+            shift256RightJamming(&c_frac, exp_diff);
+            add256(&p_frac, &c_frac);
+        }
+    }
+
+    /* Fine normalization of the 256-bit result: p_frac[0] != 0. */
+    shift = clz64(p_frac.w[0]) - 15;
+    if (shift < 0) {
+        shift256RightJamming(&p_frac, -shift);
+    } else if (shift > 0) {
+        shortShift256Left(&p_frac, shift);
+    }
+    p_exp -= shift;
+
+    if (flags & float_muladd_halve_result) {
+        p_exp -= 1;
+    }
+    return roundAndPackFloat128(p_sign ^ sign_flip,
+                                p_exp + 0x3fff - 1,
+                                p_frac.w[0], p_frac.w[1],
+                                p_frac.w[2] | (p_frac.w[3] != 0),
+                                status);
+}
+
 /*----------------------------------------------------------------------------
 | Returns the result of dividing the quadruple-precision floating-point value
 | `a' by the corresponding value `b'.  The operation is performed according to
diff --git a/tests/fp/fp-test.c b/tests/fp/fp-test.c
index 06ffebd6db..9bbb0dba67 100644
--- a/tests/fp/fp-test.c
+++ b/tests/fp/fp-test.c
@@ -717,7 +717,7 @@ static void do_testfloat(int op, int rmode, bool exact)
         test_abz_f128(true_abz_f128M, subj_abz_f128M);
         break;
     case F128_MULADD:
-        not_implemented();
+        test_abcz_f128(slow_f128M_mulAdd, qemu_f128_mulAdd);
         break;
     case F128_SQRT:
         test_az_f128(slow_f128M_sqrt, qemu_f128M_sqrt);
diff --git a/tests/fp/wrap.c.inc b/tests/fp/wrap.c.inc
index 0cbd20013e..65a713deae 100644
--- a/tests/fp/wrap.c.inc
+++ b/tests/fp/wrap.c.inc
@@ -574,6 +574,18 @@ WRAP_MULADD(qemu_f32_mulAdd, float32_muladd, float32)
 WRAP_MULADD(qemu_f64_mulAdd, float64_muladd, float64)
 #undef WRAP_MULADD
 
+static void qemu_f128_mulAdd(const float128_t *ap, const float128_t *bp,
+                             const float128_t *cp, float128_t *res)
+{
+    float128 a, b, c, ret;
+
+    a = soft_to_qemu128(*ap);
+    b = soft_to_qemu128(*bp);
+    c = soft_to_qemu128(*cp);
+    ret = float128_muladd(a, b, c, 0, &qsf);
+    *res = qemu_to_soft128(ret);
+}
+
 #define WRAP_CMP16(name, func, retcond) \
 static bool name(float16_t a, float16_t b) \
 { \
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h |   2 +
 fpu/softfloat.c         | 416 +++++++++++++++++++++++++++++++++++++++-
 tests/fp/fp-test.c      |   2 +-
 tests/fp/wrap.c.inc     |  12 ++
 4 files changed, 430 insertions(+), 2 deletions(-)
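As a usage illustration (not part of the series): a caller computes a * b + c in quad precision with a single rounding, and the existing float_muladd_* flags select the negated variants, mirroring float64_muladd. The include paths, the zero-initialised float_status and the hand-encoded binary128 constants below are assumptions made for this sketch.

    #include "qemu/osdep.h"
    #include "fpu/softfloat.h"

    static float128 f128_fma_example(void)
    {
        float_status status = { };  /* default status; rounding defaults assumed */
        float128 two   = make_float128(0x4000000000000000ULL, 0);  /* 2.0 */
        float128 three = make_float128(0x4000800000000000ULL, 0);  /* 3.0 */
        float128 one   = make_float128(0x3fff000000000000ULL, 0);  /* 1.0 */

        /* 2.0 * 3.0 + 1.0 fused, rounded once: 7.0 */
        float128 r = float128_muladd(two, three, one, 0, &status);

        /* -(2.0 * 3.0) + 1.0 = -5.0 */
        float128 nr = float128_muladd(two, three, one,
                                      float_muladd_negate_product, &status);
        (void)nr;
        return r;
    }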