Message ID | 20210527041405.391567-4-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | softfloat: Improve denormal handling | expand |
I'm probably missing something, but why do we need both "float_flag_inorm_denormal" and "float_flag_iflush_denormal"? Couldn't the code that sets these flags set just a single flag for all denormal inputs and the code that checks these flags check that single flag combined with the "flush_inputs_to_zero" flag to accomplish what the two separate "input denormal" flags do? Michael -----Original Message----- From: Richard Henderson <richard.henderson@linaro.org> Sent: Wednesday, May 26, 2021 9:14 PM To: qemu-devel@nongnu.org Cc: Michael Morrell <mmorrell@tachyum.com>; alex.bennee@linaro.org Subject: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal Create a new exception flag for reporting input denormals that are not flushed to zero, they are normalized and treated as normal numbers. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/fpu/softfloat-types.h | 15 ++++--- fpu/softfloat.c | 84 +++++++++++------------------------ fpu/softfloat-parts.c.inc | 1 + 3 files changed, 36 insertions(+), 64 deletions(-) diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h index e2d70ff556..174100e50e 100644 --- a/include/fpu/softfloat-types.h +++ b/include/fpu/softfloat-types.h @@ -143,13 +143,14 @@ typedef enum __attribute__((__packed__)) { */ enum { - float_flag_invalid = 1, - float_flag_divbyzero = 4, - float_flag_overflow = 8, - float_flag_underflow = 16, - float_flag_inexact = 32, - float_flag_iflush_denormal = 64, - float_flag_oflush_denormal = 128 + float_flag_invalid = 0x0001, + float_flag_divbyzero = 0x0002, + float_flag_overflow = 0x0004, + float_flag_underflow = 0x0008, + float_flag_inexact = 0x0010, + float_flag_inorm_denormal = 0x0020, /* denormal input, normalized */ + float_flag_iflush_denormal = 0x0040, /* denormal input, flushed to zero */ + float_flag_oflush_denormal = 0x0080, /* denormal result, flushed + to zero */ }; /* diff --git a/fpu/softfloat.c b/fpu/softfloat.c index cb077cf111..e54cdb274d 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -126,61 +126,23 @@ this code that are retained. * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result * and the result is < the minimum normal. */ -#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ + +#define GEN_INPUT_FLUSH(name, soft_t) \ static inline void name(soft_t *a, float_status *s) \ { \ if (unlikely(soft_t ## _is_denormal(*a))) { \ - *a = soft_t ## _set_sign(soft_t ## _zero, \ - soft_t ## _is_neg(*a)); \ - float_raise(float_flag_iflush_denormal, s); \ + if (s->flush_inputs_to_zero) { \ + *a = soft_t ## _set_sign(0, soft_t ## _is_neg(*a)); \ + float_raise(float_flag_iflush_denormal, s); \ + } else { \ + float_raise(float_flag_inorm_denormal, s); \ + } \ } \ } -GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) -GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) -#undef GEN_INPUT_FLUSH__NOCHECK - -#define GEN_INPUT_FLUSH1(name, soft_t) \ - static inline void name(soft_t *a, float_status *s) \ - { \ - if (likely(!s->flush_inputs_to_zero)) { \ - return; \ - } \ - soft_t ## _input_flush__nocheck(a, s); \ - } - -GEN_INPUT_FLUSH1(float32_input_flush1, float32) -GEN_INPUT_FLUSH1(float64_input_flush1, float64) -#undef GEN_INPUT_FLUSH1 - -#define GEN_INPUT_FLUSH2(name, soft_t) \ - static inline void name(soft_t *a, soft_t *b, float_status *s) \ - { \ - if (likely(!s->flush_inputs_to_zero)) { \ - return; \ - } \ - soft_t ## _input_flush__nocheck(a, s); \ - soft_t ## _input_flush__nocheck(b, s); \ - } - -GEN_INPUT_FLUSH2(float32_input_flush2, float32) -GEN_INPUT_FLUSH2(float64_input_flush2, float64) -#undef GEN_INPUT_FLUSH2 - -#define GEN_INPUT_FLUSH3(name, soft_t) \ - static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ - { \ - if (likely(!s->flush_inputs_to_zero)) { \ - return; \ - } \ - soft_t ## _input_flush__nocheck(a, s); \ - soft_t ## _input_flush__nocheck(b, s); \ - soft_t ## _input_flush__nocheck(c, s); \ - } - -GEN_INPUT_FLUSH3(float32_input_flush3, float32) -GEN_INPUT_FLUSH3(float64_input_flush3, float64) -#undef GEN_INPUT_FLUSH3 +GEN_INPUT_FLUSH(float32_input_flush, float32) +GEN_INPUT_FLUSH(float64_input_flush, float64) #undef GEN_INPUT_FLUSH /* * Choose whether to use fpclassify or float32/64_* primitives in the generated @@ -353,7 +315,8 @@ float32_gen2(float32 xa, float32 xb, float_status *s, goto soft; } - float32_input_flush2(&ua.s, &ub.s, s); + float32_input_flush(&ua.s, s); + float32_input_flush(&ub.s, s); if (unlikely(!pre(ua, ub))) { goto soft; } @@ -384,7 +347,8 @@ float64_gen2(float64 xa, float64 xb, float_status *s, goto soft; } - float64_input_flush2(&ua.s, &ub.s, s); + float64_input_flush(&ua.s, s); + float64_input_flush(&ub.s, s); if (unlikely(!pre(ua, ub))) { goto soft; } @@ -2161,7 +2125,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) goto soft; } - float32_input_flush3(&ua.s, &ub.s, &uc.s, s); + float32_input_flush(&ua.s, s); + float32_input_flush(&ub.s, s); + float32_input_flush(&uc.s, s); if (unlikely(!f32_is_zon3(ua, ub, uc))) { goto soft; } @@ -2232,7 +2198,9 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) goto soft; } - float64_input_flush3(&ua.s, &ub.s, &uc.s, s); + float64_input_flush(&ua.s, s); + float64_input_flush(&ub.s, s); + float64_input_flush(&uc.s, s); if (unlikely(!f64_is_zon3(ua, ub, uc))) { goto soft; } @@ -3988,7 +3956,8 @@ float32_hs_compare(float32 xa, float32 xb, float_status *s, bool is_quiet) goto soft; } - float32_input_flush2(&ua.s, &ub.s, s); + float32_input_flush(&ua.s, s); + float32_input_flush(&ub.s, s); if (isgreaterequal(ua.h, ub.h)) { if (isgreater(ua.h, ub.h)) { return float_relation_greater; @@ -4038,7 +4007,8 @@ float64_hs_compare(float64 xa, float64 xb, float_status *s, bool is_quiet) goto soft; } - float64_input_flush2(&ua.s, &ub.s, s); + float64_input_flush(&ua.s, s); + float64_input_flush(&ub.s, s); if (isgreaterequal(ua.h, ub.h)) { if (isgreater(ua.h, ub.h)) { return float_relation_greater; @@ -4230,7 +4200,7 @@ float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) goto soft; } - float32_input_flush1(&ua.s, s); + float32_input_flush(&ua.s, s); if (QEMU_HARDFLOAT_1F32_USE_FP) { if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || fpclassify(ua.h) == FP_ZERO) || @@ -4257,7 +4227,7 @@ float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) goto soft; } - float64_input_flush1(&ua.s, s); + float64_input_flush(&ua.s, s); if (QEMU_HARDFLOAT_1F64_USE_FP) { if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || fpclassify(ua.h) == FP_ZERO) || diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc index 72e2b24539..16d4425419 100644 --- a/fpu/softfloat-parts.c.inc +++ b/fpu/softfloat-parts.c.inc @@ -119,6 +119,7 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status, int shift = frac_normalize(p); p->cls = float_class_normal; p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1; + float_raise(float_flag_inorm_denormal, status); } } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) { p->cls = float_class_normal; -- 2.25.1
On 5/28/21 10:41 AM, Michael Morrell wrote: > I'm probably missing something, but why do we need both "float_flag_inorm_denormal" and "float_flag_iflush_denormal"? > > Couldn't the code that sets these flags set just a single flag for all denormal inputs and the code that checks these flags check that single flag > combined with the "flush_inputs_to_zero" flag to accomplish what the two separate "input denormal" flags do? The thing that you're missing is that many guests leave the accumulated softfloat exceptions in the float_status structure until the guest FPSCR register is read. Unless the guest needs to raise an exception immediately, there's no reason to do otherwise. With this setup, you have no temporal connection between "any denormal" and "flush-to-zero is set", thus two flags. r~
Richard Henderson <richard.henderson@linaro.org> writes: > Create a new exception flag for reporting input denormals that are not > flushed to zero, they are normalized and treated as normal numbers. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > include/fpu/softfloat-types.h | 15 ++++--- > fpu/softfloat.c | 84 +++++++++++------------------------ > fpu/softfloat-parts.c.inc | 1 + > 3 files changed, 36 insertions(+), 64 deletions(-) > > diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h > index e2d70ff556..174100e50e 100644 > --- a/include/fpu/softfloat-types.h > +++ b/include/fpu/softfloat-types.h > @@ -143,13 +143,14 @@ typedef enum __attribute__((__packed__)) { > */ > > enum { > - float_flag_invalid = 1, > - float_flag_divbyzero = 4, > - float_flag_overflow = 8, > - float_flag_underflow = 16, > - float_flag_inexact = 32, > - float_flag_iflush_denormal = 64, > - float_flag_oflush_denormal = 128 > + float_flag_invalid = 0x0001, > + float_flag_divbyzero = 0x0002, > + float_flag_overflow = 0x0004, > + float_flag_underflow = 0x0008, > + float_flag_inexact = 0x0010, > + float_flag_inorm_denormal = 0x0020, /* denormal input, normalized */ > + float_flag_iflush_denormal = 0x0040, /* denormal input, flushed to zero */ > + float_flag_oflush_denormal = 0x0080, /* denormal result, flushed to zero */ > }; > > /* > diff --git a/fpu/softfloat.c b/fpu/softfloat.c > index cb077cf111..e54cdb274d 100644 > --- a/fpu/softfloat.c > +++ b/fpu/softfloat.c > @@ -126,61 +126,23 @@ this code that are retained. > * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result > * and the result is < the minimum normal. > */ > -#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ > + > +#define GEN_INPUT_FLUSH(name, soft_t) \ > static inline void name(soft_t *a, float_status *s) \ > { \ > if (unlikely(soft_t ## _is_denormal(*a))) { \ > - *a = soft_t ## _set_sign(soft_t ## _zero, \ > - soft_t ## _is_neg(*a)); \ > - float_raise(float_flag_iflush_denormal, s); \ > + if (s->flush_inputs_to_zero) { \ > + *a = soft_t ## _set_sign(0, soft_t ## _is_neg(*a)); \ > + float_raise(float_flag_iflush_denormal, s); \ > + } else { \ > + float_raise(float_flag_inorm_denormal, s); \ > + } \ > } \ > } So I'm guessing Emilio had the original flush code split was to avoid multiple checks against s->flush_inputs_to_zero in the code. The was possibly a good reason, comparing the before/after of float32_mul: Dump of assembler code for function float32_mul: 0x0000000000934240 <+0>: movzbl 0x1(%rdx),%eax 0x0000000000934244 <+4>: test $0x20,%al 0x0000000000934246 <+6>: je 0x9342b0 <float32_mul+112> 0x0000000000934248 <+8>: cmpb $0x0,(%rdx) 0x000000000093424b <+11>: jne 0x9342b0 <float32_mul+112> 0x000000000093424d <+13>: cmpb $0x0,0x5(%rdx) 0x0000000000934251 <+17>: jne 0x9342d0 <float32_mul+144> 0x0000000000934253 <+19>: mov %edi,%eax 0x0000000000934255 <+21>: shr $0x17,%eax 0x0000000000934258 <+24>: add $0x1,%eax 0x000000000093425b <+27>: test $0xfe,%al 0x000000000093425d <+29>: je 0x9342a8 <float32_mul+104> 0x000000000093425f <+31>: mov %esi,%eax 0x0000000000934261 <+33>: shr $0x17,%eax 0x0000000000934264 <+36>: add $0x1,%eax 0x0000000000934267 <+39>: test $0xfe,%al 0x0000000000934269 <+41>: jne 0x934273 <float32_mul+51> 0x000000000093426b <+43>: test $0x7fffffff,%esi 0x0000000000934271 <+49>: jne 0x9342b0 <float32_mul+112> 0x0000000000934273 <+51>: mov %esi,-0xc(%rsp) 0x0000000000934277 <+55>: movss -0xc(%rsp),%xmm0 0x000000000093427d <+61>: mov %edi,-0xc(%rsp) 0x0000000000934281 <+65>: movss -0xc(%rsp),%xmm2 0x0000000000934287 <+71>: mulss %xmm2,%xmm0 0x000000000093428b <+75>: movd %xmm0,%eax 0x000000000093428f <+79>: andps 0x3b805a(%rip),%xmm0 # 0xcec2f0 0x0000000000934296 <+86>: ucomiss 0x3b8047(%rip),%xmm0 # 0xcec2e4 0x000000000093429d <+93>: jbe 0x9342b8 <float32_mul+120> 0x000000000093429f <+95>: orb $0x8,0x1(%rdx) 0x00000000009342a3 <+99>: retq 0x00000000009342a4 <+100>: nopl 0x0(%rax) 0x00000000009342a8 <+104>: test $0x7fffffff,%edi 0x00000000009342ae <+110>: je 0x93425f <float32_mul+31> 0x00000000009342b0 <+112>: jmpq 0x9290d0 <soft_f32_mul> 0x00000000009342b5 <+117>: nopl (%rax) 0x00000000009342b8 <+120>: movss 0x3b8020(%rip),%xmm1 # 0xcec2e0 0x00000000009342c0 <+128>: comiss %xmm0,%xmm1 0x00000000009342c3 <+131>: jae 0x934320 <float32_mul+224> 0x00000000009342c5 <+133>: retq 0x00000000009342c6 <+134>: nopw %cs:0x0(%rax,%rax,1) 0x00000000009342d0 <+144>: test $0x7f800000,%edi 0x00000000009342d6 <+150>: jne 0x9342f0 <float32_mul+176> 0x00000000009342d8 <+152>: test $0x7fffffff,%edi 0x00000000009342de <+158>: je 0x9342f0 <float32_mul+176> 0x00000000009342e0 <+160>: or $0x40,%eax 0x00000000009342e3 <+163>: and $0x80000000,%edi 0x00000000009342e9 <+169>: mov %al,0x1(%rdx) 0x00000000009342ec <+172>: nopl 0x0(%rax) 0x00000000009342f0 <+176>: test $0x7f800000,%esi 0x00000000009342f6 <+182>: jne 0x934253 <float32_mul+19> 0x00000000009342fc <+188>: test $0x7fffffff,%esi 0x0000000000934302 <+194>: je 0x934253 <float32_mul+19> 0x0000000000934308 <+200>: and $0x80000000,%esi 0x000000000093430e <+206>: orb $0x40,0x1(%rdx) 0x0000000000934312 <+210>: jmpq 0x934253 <float32_mul+19> 0x0000000000934317 <+215>: nopw 0x0(%rax,%rax,1) 0x0000000000934320 <+224>: mov %edi,%ecx 0x0000000000934322 <+226>: or %esi,%ecx 0x0000000000934324 <+228>: and $0x7fffffff,%ecx 0x000000000093432a <+234>: jne 0x9342b0 <float32_mul+112> 0x000000000093432c <+236>: jmp 0x9342c5 <float32_mul+133> End of assembler dump. And after this change: Dump of assembler code for function float32_mul: 0x0000000000895d60 <+0>: movzbl 0x1(%rdx),%eax 0x0000000000895d64 <+4>: test $0x10,%al 0x0000000000895d66 <+6>: je 0x895e30 <float32_mul+208> 0x0000000000895d6c <+12>: cmpb $0x0,(%rdx) 0x0000000000895d6f <+15>: jne 0x895e30 <float32_mul+208> 0x0000000000895d75 <+21>: test $0x7f800000,%edi 0x0000000000895d7b <+27>: jne 0x895da0 <float32_mul+64> 0x0000000000895d7d <+29>: test $0x7fffffff,%edi 0x0000000000895d83 <+35>: je 0x895da0 <float32_mul+64> 0x0000000000895d85 <+37>: cmpb $0x0,0x5(%rdx) 0x0000000000895d89 <+41>: je 0x895e60 <float32_mul+256> 0x0000000000895d8f <+47>: or $0x40,%eax 0x0000000000895d92 <+50>: and $0x80000000,%edi 0x0000000000895d98 <+56>: mov %al,0x1(%rdx) 0x0000000000895d9b <+59>: nopl 0x0(%rax,%rax,1) 0x0000000000895da0 <+64>: test $0x7f800000,%esi 0x0000000000895da6 <+70>: jne 0x895dd0 <float32_mul+112> 0x0000000000895da8 <+72>: test $0x7fffffff,%esi 0x0000000000895dae <+78>: je 0x895dd0 <float32_mul+112> 0x0000000000895db0 <+80>: cmpb $0x0,0x5(%rdx) 0x0000000000895db4 <+84>: movzbl 0x1(%rdx),%eax 0x0000000000895db8 <+88>: je 0x895e50 <float32_mul+240> 0x0000000000895dbe <+94>: or $0x40,%eax 0x0000000000895dc1 <+97>: and $0x80000000,%esi 0x0000000000895dc7 <+103>: mov %al,0x1(%rdx) 0x0000000000895dca <+106>: nopw 0x0(%rax,%rax,1) 0x0000000000895dd0 <+112>: mov %edi,%eax 0x0000000000895dd2 <+114>: shr $0x17,%eax 0x0000000000895dd5 <+117>: add $0x1,%eax 0x0000000000895dd8 <+120>: test $0xfe,%al 0x0000000000895dda <+122>: je 0x895e28 <float32_mul+200> 0x0000000000895ddc <+124>: mov %esi,%eax 0x0000000000895dde <+126>: shr $0x17,%eax 0x0000000000895de1 <+129>: add $0x1,%eax 0x0000000000895de4 <+132>: test $0xfe,%al 0x0000000000895de6 <+134>: jne 0x895df0 <float32_mul+144> 0x0000000000895de8 <+136>: test $0x7fffffff,%esi 0x0000000000895dee <+142>: jne 0x895e30 <float32_mul+208> 0x0000000000895df0 <+144>: mov %esi,-0xc(%rsp) 0x0000000000895df4 <+148>: movss -0xc(%rsp),%xmm0 0x0000000000895dfa <+154>: mov %edi,-0xc(%rsp) 0x0000000000895dfe <+158>: movss -0xc(%rsp),%xmm2 0x0000000000895e04 <+164>: mulss %xmm2,%xmm0 0x0000000000895e08 <+168>: movd %xmm0,%eax 0x0000000000895e0c <+172>: andps 0x46bb5d(%rip),%xmm0 # 0xd01970 0x0000000000895e13 <+179>: ucomiss 0x46bb4a(%rip),%xmm0 # 0xd01964 0x0000000000895e1a <+186>: jbe 0x895e38 <float32_mul+216> 0x0000000000895e1c <+188>: orb $0x4,0x1(%rdx) 0x0000000000895e20 <+192>: retq 0x0000000000895e21 <+193>: nopl 0x0(%rax) 0x0000000000895e28 <+200>: test $0x7fffffff,%edi 0x0000000000895e2e <+206>: je 0x895ddc <float32_mul+124> 0x0000000000895e30 <+208>: jmpq 0x88a8c0 <soft_f32_mul> 0x0000000000895e35 <+213>: nopl (%rax) 0x0000000000895e38 <+216>: movss 0x46bb20(%rip),%xmm1 # 0xd01960 0x0000000000895e40 <+224>: comiss %xmm0,%xmm1 0x0000000000895e43 <+227>: jae 0x895e70 <float32_mul+272> 0x0000000000895e45 <+229>: retq 0x0000000000895e46 <+230>: nopw %cs:0x0(%rax,%rax,1) 0x0000000000895e50 <+240>: or $0x20,%eax 0x0000000000895e53 <+243>: mov %al,0x1(%rdx) 0x0000000000895e56 <+246>: jmpq 0x895dd0 <float32_mul+112> 0x0000000000895e5b <+251>: nopl 0x0(%rax,%rax,1) 0x0000000000895e60 <+256>: or $0x20,%eax 0x0000000000895e63 <+259>: mov %al,0x1(%rdx) 0x0000000000895e66 <+262>: jmpq 0x895da0 <float32_mul+64> 0x0000000000895e6b <+267>: nopl 0x0(%rax,%rax,1) 0x0000000000895e70 <+272>: mov %esi,%ecx 0x0000000000895e72 <+274>: or %edi,%ecx 0x0000000000895e74 <+276>: and $0x7fffffff,%ecx 0x0000000000895e7a <+282>: jne 0x895e30 <float32_mul+208> 0x0000000000895e7c <+284>: jmp 0x895e45 <float32_mul+229> End of assembler dump. However I'm not sure how much of that increase is down to the change of macro expansion and how much is due to the extra leg for the flushing. Anyway other than that observation seems OK to me: Signed-off-by: Alex Bennée <alex.bennee@linaro.org> -- Alex Bennée
Alex Bennée <alex.bennee@linaro.org> writes: > Richard Henderson <richard.henderson@linaro.org> writes: > >> Create a new exception flag for reporting input denormals that are not >> flushed to zero, they are normalized and treated as normal numbers. >> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> <snip> > > Anyway other than that observation seems OK to me: > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org> I of course meant: Reviewed-by: Alex Bennée <alex.bennee@linaro.org> -- Alex Bennée
On 6/7/21 8:35 AM, Alex Bennée wrote: > So I'm guessing Emilio had the original flush code split was to avoid > multiple checks against s->flush_inputs_to_zero in the code. The was > possibly a good reason, comparing the before/after of float32_mul: I assumed that the most important thing now is that we test floatN_is_denormal only once -- the test for flush_inputs_to_zero is fairly trivial. If you've got a better ordering of operations for this, do tell. r~
Richard Henderson <richard.henderson@linaro.org> writes: > On 6/7/21 8:35 AM, Alex Bennée wrote: >> So I'm guessing Emilio had the original flush code split was to avoid >> multiple checks against s->flush_inputs_to_zero in the code. The was >> possibly a good reason, comparing the before/after of float32_mul: > > I assumed that the most important thing now is that we test > floatN_is_denormal only once -- the test for flush_inputs_to_zero is > fairly trivial. > > If you've got a better ordering of operations for this, do tell. What I really want is to know which instructions translate into the if (s->flush_inputs_to_zero) and verifying that is only checked once. Maybe I'm just suspicious of compilers ability to optimise things away... -- Alex Bennée
On 6/7/21 10:19 AM, Alex Bennée wrote: >> If you've got a better ordering of operations for this, do tell. > > What I really want is to know which instructions translate into the if > (s->flush_inputs_to_zero) and verifying that is only checked once. Maybe > I'm just suspicious of compilers ability to optimise things away... > Dump of assembler code for function float32_mul: > 0x0000000000895d60 <+0>: movzbl 0x1(%rdx),%eax > 0x0000000000895d64 <+4>: test $0x10,%al > 0x0000000000895d66 <+6>: je 0x895e30 <float32_mul+208> s->float_exception_flags & float_flag_inexact > 0x0000000000895d6c <+12>: cmpb $0x0,(%rdx) > 0x0000000000895d6f <+15>: jne 0x895e30 <float32_mul+208> s->float_rounding_mode == float_round_nearest_even > 0x0000000000895d75 <+21>: test $0x7f800000,%edi > 0x0000000000895d7b <+27>: jne 0x895da0 <float32_mul+64> > 0x0000000000895d7d <+29>: test $0x7fffffff,%edi > 0x0000000000895d83 <+35>: je 0x895da0 <float32_mul+64> float32_is_denormal > 0x0000000000895d85 <+37>: cmpb $0x0,0x5(%rdx) > 0x0000000000895d89 <+41>: je 0x895e60 <float32_mul+256> s->flush_inputs_to_zero > 0x0000000000895d8f <+47>: or $0x40,%eax > 0x0000000000895d92 <+50>: and $0x80000000,%edi > 0x0000000000895d98 <+56>: mov %al,0x1(%rdx) flush-to-zero and set iflush_denormal > 0x0000000000895da0 <+64>: test $0x7f800000,%esi > 0x0000000000895da6 <+70>: jne 0x895dd0 <float32_mul+112> > 0x0000000000895da8 <+72>: test $0x7fffffff,%esi > 0x0000000000895dae <+78>: je 0x895dd0 <float32_mul+112> float32_is_denormal (second operand) > 0x0000000000895db0 <+80>: cmpb $0x0,0x5(%rdx) > 0x0000000000895db4 <+84>: movzbl 0x1(%rdx),%eax > 0x0000000000895db8 <+88>: je 0x895e50 <float32_mul+240> > 0x0000000000895dbe <+94>: or $0x40,%eax > 0x0000000000895dc1 <+97>: and $0x80000000,%esi s->flush_inputs_to_zero, flush-to-zero, set iflush_denormal. ... > 0x0000000000895e50 <+240>: or $0x20,%eax > 0x0000000000895e53 <+243>: mov %al,0x1(%rdx) > 0x0000000000895e56 <+246>: jmpq 0x895dd0 <float32_mul+112> set inorm_denormal (second operand) > 0x0000000000895e60 <+256>: or $0x20,%eax > 0x0000000000895e63 <+259>: mov %al,0x1(%rdx) > 0x0000000000895e66 <+262>: jmpq 0x895da0 <float32_mul+64> set inorm_denormal (first operand) There do seem to be 3 reads/writes to exception_flags for float_raise. r~
Just curious. What's the expected timeline to get these denormal patches in the tree? -----Original Message----- From: Richard Henderson <richard.henderson@linaro.org> Sent: Saturday, May 29, 2021 8:21 AM To: Michael Morrell <mmorrell@tachyum.com>; qemu-devel@nongnu.org Cc: alex.bennee@linaro.org Subject: Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal On 5/28/21 10:41 AM, Michael Morrell wrote: > I'm probably missing something, but why do we need both "float_flag_inorm_denormal" and "float_flag_iflush_denormal"? > > Couldn't the code that sets these flags set just a single flag for all > denormal inputs and the code that checks these flags check that single flag combined with the "flush_inputs_to_zero" flag to accomplish what the two separate "input denormal" flags do? The thing that you're missing is that many guests leave the accumulated softfloat exceptions in the float_status structure until the guest FPSCR register is read. Unless the guest needs to raise an exception immediately, there's no reason to do otherwise. With this setup, you have no temporal connection between "any denormal" and "flush-to-zero is set", thus two flags. r~
On 7/14/21 9:44 AM, Michael Morrell wrote:
> Just curious. What's the expected timeline to get these denormal patches in the tree?
Next development cycle, at minimum. I need to fix the problems vs NaNs that you identified.
r~
OK, thanks for the update. I also appreciate you looking at the NaN issue. Michael -----Original Message----- From: Richard Henderson <richard.henderson@linaro.org> Sent: Wednesday, July 14, 2021 9:57 AM To: Michael Morrell <mmorrell@tachyum.com>; qemu-devel@nongnu.org Subject: Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal On 7/14/21 9:44 AM, Michael Morrell wrote: > Just curious. What's the expected timeline to get these denormal patches in the tree? Next development cycle, at minimum. I need to fix the problems vs NaNs that you identified. r~
Richard, It's been 6 months so I thought I'd check in again. Do you have an estimate of when this will go in? Michael -----Original Message----- From: Michael Morrell Sent: Wednesday, July 14, 2021 10:50 AM To: 'Richard Henderson' <richard.henderson@linaro.org>; qemu-devel@nongnu.org Subject: RE: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal OK, thanks for the update. I also appreciate you looking at the NaN issue. Michael -----Original Message----- From: Richard Henderson <richard.henderson@linaro.org> Sent: Wednesday, July 14, 2021 9:57 AM To: Michael Morrell <mmorrell@tachyum.com>; qemu-devel@nongnu.org Subject: Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal On 7/14/21 9:44 AM, Michael Morrell wrote: > Just curious. What's the expected timeline to get these denormal patches in the tree? Next development cycle, at minimum. I need to fix the problems vs NaNs that you identified. r~
diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h index e2d70ff556..174100e50e 100644 --- a/include/fpu/softfloat-types.h +++ b/include/fpu/softfloat-types.h @@ -143,13 +143,14 @@ typedef enum __attribute__((__packed__)) { */ enum { - float_flag_invalid = 1, - float_flag_divbyzero = 4, - float_flag_overflow = 8, - float_flag_underflow = 16, - float_flag_inexact = 32, - float_flag_iflush_denormal = 64, - float_flag_oflush_denormal = 128 + float_flag_invalid = 0x0001, + float_flag_divbyzero = 0x0002, + float_flag_overflow = 0x0004, + float_flag_underflow = 0x0008, + float_flag_inexact = 0x0010, + float_flag_inorm_denormal = 0x0020, /* denormal input, normalized */ + float_flag_iflush_denormal = 0x0040, /* denormal input, flushed to zero */ + float_flag_oflush_denormal = 0x0080, /* denormal result, flushed to zero */ }; /* diff --git a/fpu/softfloat.c b/fpu/softfloat.c index cb077cf111..e54cdb274d 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -126,61 +126,23 @@ this code that are retained. * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result * and the result is < the minimum normal. */ -#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ + +#define GEN_INPUT_FLUSH(name, soft_t) \ static inline void name(soft_t *a, float_status *s) \ { \ if (unlikely(soft_t ## _is_denormal(*a))) { \ - *a = soft_t ## _set_sign(soft_t ## _zero, \ - soft_t ## _is_neg(*a)); \ - float_raise(float_flag_iflush_denormal, s); \ + if (s->flush_inputs_to_zero) { \ + *a = soft_t ## _set_sign(0, soft_t ## _is_neg(*a)); \ + float_raise(float_flag_iflush_denormal, s); \ + } else { \ + float_raise(float_flag_inorm_denormal, s); \ + } \ } \ } -GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) -GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) -#undef GEN_INPUT_FLUSH__NOCHECK - -#define GEN_INPUT_FLUSH1(name, soft_t) \ - static inline void name(soft_t *a, float_status *s) \ - { \ - if (likely(!s->flush_inputs_to_zero)) { \ - return; \ - } \ - soft_t ## _input_flush__nocheck(a, s); \ - } - -GEN_INPUT_FLUSH1(float32_input_flush1, float32) -GEN_INPUT_FLUSH1(float64_input_flush1, float64) -#undef GEN_INPUT_FLUSH1 - -#define GEN_INPUT_FLUSH2(name, soft_t) \ - static inline void name(soft_t *a, soft_t *b, float_status *s) \ - { \ - if (likely(!s->flush_inputs_to_zero)) { \ - return; \ - } \ - soft_t ## _input_flush__nocheck(a, s); \ - soft_t ## _input_flush__nocheck(b, s); \ - } - -GEN_INPUT_FLUSH2(float32_input_flush2, float32) -GEN_INPUT_FLUSH2(float64_input_flush2, float64) -#undef GEN_INPUT_FLUSH2 - -#define GEN_INPUT_FLUSH3(name, soft_t) \ - static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ - { \ - if (likely(!s->flush_inputs_to_zero)) { \ - return; \ - } \ - soft_t ## _input_flush__nocheck(a, s); \ - soft_t ## _input_flush__nocheck(b, s); \ - soft_t ## _input_flush__nocheck(c, s); \ - } - -GEN_INPUT_FLUSH3(float32_input_flush3, float32) -GEN_INPUT_FLUSH3(float64_input_flush3, float64) -#undef GEN_INPUT_FLUSH3 +GEN_INPUT_FLUSH(float32_input_flush, float32) +GEN_INPUT_FLUSH(float64_input_flush, float64) +#undef GEN_INPUT_FLUSH /* * Choose whether to use fpclassify or float32/64_* primitives in the generated @@ -353,7 +315,8 @@ float32_gen2(float32 xa, float32 xb, float_status *s, goto soft; } - float32_input_flush2(&ua.s, &ub.s, s); + float32_input_flush(&ua.s, s); + float32_input_flush(&ub.s, s); if (unlikely(!pre(ua, ub))) { goto soft; } @@ -384,7 +347,8 @@ float64_gen2(float64 xa, float64 xb, float_status *s, goto soft; } - float64_input_flush2(&ua.s, &ub.s, s); + float64_input_flush(&ua.s, s); + float64_input_flush(&ub.s, s); if (unlikely(!pre(ua, ub))) { goto soft; } @@ -2161,7 +2125,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) goto soft; } - float32_input_flush3(&ua.s, &ub.s, &uc.s, s); + float32_input_flush(&ua.s, s); + float32_input_flush(&ub.s, s); + float32_input_flush(&uc.s, s); if (unlikely(!f32_is_zon3(ua, ub, uc))) { goto soft; } @@ -2232,7 +2198,9 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) goto soft; } - float64_input_flush3(&ua.s, &ub.s, &uc.s, s); + float64_input_flush(&ua.s, s); + float64_input_flush(&ub.s, s); + float64_input_flush(&uc.s, s); if (unlikely(!f64_is_zon3(ua, ub, uc))) { goto soft; } @@ -3988,7 +3956,8 @@ float32_hs_compare(float32 xa, float32 xb, float_status *s, bool is_quiet) goto soft; } - float32_input_flush2(&ua.s, &ub.s, s); + float32_input_flush(&ua.s, s); + float32_input_flush(&ub.s, s); if (isgreaterequal(ua.h, ub.h)) { if (isgreater(ua.h, ub.h)) { return float_relation_greater; @@ -4038,7 +4007,8 @@ float64_hs_compare(float64 xa, float64 xb, float_status *s, bool is_quiet) goto soft; } - float64_input_flush2(&ua.s, &ub.s, s); + float64_input_flush(&ua.s, s); + float64_input_flush(&ub.s, s); if (isgreaterequal(ua.h, ub.h)) { if (isgreater(ua.h, ub.h)) { return float_relation_greater; @@ -4230,7 +4200,7 @@ float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) goto soft; } - float32_input_flush1(&ua.s, s); + float32_input_flush(&ua.s, s); if (QEMU_HARDFLOAT_1F32_USE_FP) { if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || fpclassify(ua.h) == FP_ZERO) || @@ -4257,7 +4227,7 @@ float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) goto soft; } - float64_input_flush1(&ua.s, s); + float64_input_flush(&ua.s, s); if (QEMU_HARDFLOAT_1F64_USE_FP) { if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || fpclassify(ua.h) == FP_ZERO) || diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc index 72e2b24539..16d4425419 100644 --- a/fpu/softfloat-parts.c.inc +++ b/fpu/softfloat-parts.c.inc @@ -119,6 +119,7 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status, int shift = frac_normalize(p); p->cls = float_class_normal; p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1; + float_raise(float_flag_inorm_denormal, status); } } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) { p->cls = float_class_normal;
Create a new exception flag for reporting input denormals that are not flushed to zero, they are normalized and treated as normal numbers. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/fpu/softfloat-types.h | 15 ++++--- fpu/softfloat.c | 84 +++++++++++------------------------ fpu/softfloat-parts.c.inc | 1 + 3 files changed, 36 insertions(+), 64 deletions(-) -- 2.25.1