Message ID | 20190116202349.29272-5-alex.bennee@linaro.org |
---|---|
State | New |
Headers | show |
Series | current fpu/next queue | expand |
On 1/17/19 7:23 AM, Alex Bennée wrote: > Apparently some versions of clang can't handle inline assembly with > __int128 parameters, especially on s390. Instead of hand-coding the > s390 divide provide a generic fallback for anything that provides > __int128 capable maths. > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org> > Cc: Thomas Huth <thuth@redhat.com> > --- > include/fpu/softfloat-macros.h | 10 ++++------ > 1 file changed, 4 insertions(+), 6 deletions(-) > > diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h > index b1d772e6d4..1a43609eef 100644 > --- a/include/fpu/softfloat-macros.h > +++ b/include/fpu/softfloat-macros.h > @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, > uint64_t q; > asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); > return q; > -#elif defined(__s390x__) > - /* Need to use a TImode type to get an even register pair for DLGR. */ > - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; > - asm("dlgr %0, %1" : "+r"(n) : "r"(d)); > - *r = n >> 64; > - return n; > #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) > /* From Power ISA 2.06, programming note for divdeu. */ > uint64_t q1, q2, Q, r1, r2, R; > @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, > } > *r = R; > return Q; > +#elif defined(CONFIG_INT128) > + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; > + *r = n % d; > + return n / d; > #else I thought that we'd shown that, at least at present, no compiler is taking advantage of hardware insns for this, and is promoting this to a full 128-bit divide. And further that the version using 64-bit arithmetic was competitive with the hardware insn. I'd rather not include this hunk for now. r~
On 2019-01-16 21:23, Alex Bennée wrote: > Apparently some versions of clang can't handle inline assembly with > __int128 parameters, especially on s390. Instead of hand-coding the > s390 divide provide a generic fallback for anything that provides > __int128 capable maths. > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org> > Cc: Thomas Huth <thuth@redhat.com> > --- > include/fpu/softfloat-macros.h | 10 ++++------ > 1 file changed, 4 insertions(+), 6 deletions(-) > > diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h > index b1d772e6d4..1a43609eef 100644 > --- a/include/fpu/softfloat-macros.h > +++ b/include/fpu/softfloat-macros.h > @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, > uint64_t q; > asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); > return q; > -#elif defined(__s390x__) > - /* Need to use a TImode type to get an even register pair for DLGR. */ > - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; > - asm("dlgr %0, %1" : "+r"(n) : "r"(d)); > - *r = n >> 64; > - return n; > #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) > /* From Power ISA 2.06, programming note for divdeu. */ > uint64_t q1, q2, Q, r1, r2, R; > @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, > } > *r = R; > return Q; > +#elif defined(CONFIG_INT128) > + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; > + *r = n % d; > + return n / d; > #else > uint64_t d0, d1, q0, q1, r1, r0, m; No, please don't. Use my !defined(__clang__) patch instead, please. Thomas
Richard Henderson <richard.henderson@linaro.org> writes: > On 1/17/19 7:23 AM, Alex Bennée wrote: >> Apparently some versions of clang can't handle inline assembly with >> __int128 parameters, especially on s390. Instead of hand-coding the >> s390 divide provide a generic fallback for anything that provides >> __int128 capable maths. >> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org> >> Cc: Thomas Huth <thuth@redhat.com> >> --- >> include/fpu/softfloat-macros.h | 10 ++++------ >> 1 file changed, 4 insertions(+), 6 deletions(-) >> >> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h >> index b1d772e6d4..1a43609eef 100644 >> --- a/include/fpu/softfloat-macros.h >> +++ b/include/fpu/softfloat-macros.h >> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, >> uint64_t q; >> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); >> return q; >> -#elif defined(__s390x__) >> - /* Need to use a TImode type to get an even register pair for DLGR. */ >> - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; >> - asm("dlgr %0, %1" : "+r"(n) : "r"(d)); >> - *r = n >> 64; >> - return n; >> #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) >> /* From Power ISA 2.06, programming note for divdeu. */ >> uint64_t q1, q2, Q, r1, r2, R; >> @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, >> } >> *r = R; >> return Q; >> +#elif defined(CONFIG_INT128) >> + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; >> + *r = n % d; >> + return n / d; >> #else > > I thought that we'd shown that, at least at present, no compiler is taking > advantage of hardware insns for this, and is promoting this to a full 128-bit > divide. And further that the version using 64-bit arithmetic was competitive > with the hardware insn. Yeah it seems so. 
While Thomas' numbers weren't convincing, the CONFIG_INT128 fallback did trigger on my SynQuacer and knocked about 2 MFlops off its admittedly slow performance. Amusingly, of course, it's faster under translation because of the hardware fallback: 07:44:44 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ./fp-bench -o div -p double 13.28 MFlops 07:44:49 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ./fp-bench -o div -p double -t host 498.20 MFlops 07:44:53 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ../../aarch64-linux-user/qemu-aarch64 ./fp-bench -o div -p double -t host 52.71 MFlops I'll drop this and use Thomas' #elif defined(__s390x__) && !defined(__clang__) version in the pull-request. -- Alex Bennée
diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h index b1d772e6d4..1a43609eef 100644 --- a/include/fpu/softfloat-macros.h +++ b/include/fpu/softfloat-macros.h @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, uint64_t q; asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); return q; -#elif defined(__s390x__) - /* Need to use a TImode type to get an even register pair for DLGR. */ - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; - asm("dlgr %0, %1" : "+r"(n) : "r"(d)); - *r = n >> 64; - return n; #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) /* From Power ISA 2.06, programming note for divdeu. */ uint64_t q1, q2, Q, r1, r2, R; @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, } *r = R; return Q; +#elif defined(CONFIG_INT128) + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; + *r = n % d; + return n / d; #else uint64_t d0, d1, q0, q1, r1, r0, m;
Apparently some versions of clang can't handle inline assembly with __int128 parameters, especially on s390. Instead of hand-coding the s390 divide, provide a generic fallback for anything that provides __int128-capable maths. Signed-off-by: Alex Bennée <alex.bennee@linaro.org> Cc: Thomas Huth <thuth@redhat.com> --- include/fpu/softfloat-macros.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) -- 2.17.1