Message ID | 1302535928-15901-1-git-send-email-peter.maydell@linaro.org |
---|---|
State | Accepted |
Commit | 4d9ad7f793605abd9806fc932b3e04e028894565 |
Headers | show |
On Mon, Apr 11, 2011 at 04:32:08PM +0100, Peter Maydell wrote: > In the VABAL instruction we take the absolute difference of two > values of size x and store it in a result of size 2x. This means > we have to be careful to calculate the absolute difference using > a wide enough type that we don't accidentally overflow. > > Signed-off-by: Peter Maydell <peter.maydell@linaro.org> > --- > target-arm/neon_helper.c | 38 +++++++++++++++++++++----------------- > 1 files changed, 21 insertions(+), 17 deletions(-) > > diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c > index c3ac96a..7df925a 100644 > --- a/target-arm/neon_helper.c > +++ b/target-arm/neon_helper.c > @@ -1514,9 +1514,13 @@ uint64_t HELPER(neon_addl_saturate_s64)(uint64_t a, uint64_t b) > return result; > } > > -#define DO_ABD(dest, x, y, type) do { \ > - type tmp_x = x; \ > - type tmp_y = y; \ > +/* We have to do the arithmetic in a larger type than > + * the input type, because for example with a signed 32 bit > + * op the absolute difference can overflow a signed 32 bit value. > + */ > +#define DO_ABD(dest, x, y, intype, arithtype) do { \ > + arithtype tmp_x = (intype)(x); \ > + arithtype tmp_y = (intype)(y); \ > dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ > } while(0) > > @@ -1524,12 +1528,12 @@ uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) > { > uint64_t tmp; > uint64_t result; > - DO_ABD(result, a, b, uint8_t); > - DO_ABD(tmp, a >> 8, b >> 8, uint8_t); > + DO_ABD(result, a, b, uint8_t, uint32_t); > + DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); > result |= tmp << 16; > - DO_ABD(tmp, a >> 16, b >> 16, uint8_t); > + DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); > result |= tmp << 32; > - DO_ABD(tmp, a >> 24, b >> 24, uint8_t); > + DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); > result |= tmp << 48; > return result; > } Do we really need a 32-bit type for the computation here? 
> @@ -1538,12 +1542,12 @@ uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) > { > uint64_t tmp; > uint64_t result; > - DO_ABD(result, a, b, int8_t); > - DO_ABD(tmp, a >> 8, b >> 8, int8_t); > + DO_ABD(result, a, b, int8_t, int32_t); > + DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); > result |= tmp << 16; > - DO_ABD(tmp, a >> 16, b >> 16, int8_t); > + DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); > result |= tmp << 32; > - DO_ABD(tmp, a >> 24, b >> 24, int8_t); > + DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); > result |= tmp << 48; > return result; > } Ditto. > @@ -1552,8 +1556,8 @@ uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) > { > uint64_t tmp; > uint64_t result; > - DO_ABD(result, a, b, uint16_t); > - DO_ABD(tmp, a >> 16, b >> 16, uint16_t); > + DO_ABD(result, a, b, uint16_t, uint32_t); > + DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); > return result | (tmp << 32); > } > > @@ -1561,22 +1565,22 @@ uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) > { > uint64_t tmp; > uint64_t result; > - DO_ABD(result, a, b, int16_t); > - DO_ABD(tmp, a >> 16, b >> 16, int16_t); > + DO_ABD(result, a, b, int16_t, int32_t); > + DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); > return result | (tmp << 32); > } > > uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) > { > uint64_t result; > - DO_ABD(result, a, b, uint32_t); > + DO_ABD(result, a, b, uint32_t, uint64_t); > return result; > } > > uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) > { > uint64_t result; > - DO_ABD(result, a, b, int32_t); > + DO_ABD(result, a, b, int32_t, int64_t); > return result; > } > #undef DO_ABD All the others look fine.
On 12 April 2011 22:32, Aurelien Jarno <aurelien@aurel32.net> wrote: > On Mon, Apr 11, 2011 at 04:32:08PM +0100, Peter Maydell wrote: >> @@ -1524,12 +1528,12 @@ uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) >> { >> uint64_t tmp; >> uint64_t result; >> - DO_ABD(result, a, b, uint8_t); >> - DO_ABD(tmp, a >> 8, b >> 8, uint8_t); >> + DO_ABD(result, a, b, uint8_t, uint32_t); >> + DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); >> result |= tmp << 16; >> - DO_ABD(tmp, a >> 16, b >> 16, uint8_t); >> + DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); >> result |= tmp << 32; >> - DO_ABD(tmp, a >> 24, b >> 24, uint8_t); >> + DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); >> result |= tmp << 48; >> return result; >> } > > Do we really need a 32-bit type for the computation here? No, anything wider than 8 will do, but my guess was that in practice 32 bits would be fractionally more efficient than unnecessarily forcing 16 bit arithmetic. For that matter I guess we could just say "int" and "unsigned int" since C guarantees us at least 16 bits there. -- PMM
On Tue, Apr 12, 2011 at 11:31:20PM +0100, Peter Maydell wrote: > On 12 April 2011 22:32, Aurelien Jarno <aurelien@aurel32.net> wrote: > > On Mon, Apr 11, 2011 at 04:32:08PM +0100, Peter Maydell wrote: > > >> @@ -1524,12 +1528,12 @@ uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) > >> { > >> uint64_t tmp; > >> uint64_t result; > >> - DO_ABD(result, a, b, uint8_t); > >> - DO_ABD(tmp, a >> 8, b >> 8, uint8_t); > >> + DO_ABD(result, a, b, uint8_t, uint32_t); > >> + DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); > >> result |= tmp << 16; > >> - DO_ABD(tmp, a >> 16, b >> 16, uint8_t); > >> + DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); > >> result |= tmp << 32; > >> - DO_ABD(tmp, a >> 24, b >> 24, uint8_t); > >> + DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); > >> result |= tmp << 48; > >> return result; > >> } > > > > Do we really need a 32-bit type for the computation here? > > No, anything wider than 8 will do, but my guess was that in > practice 32 bits would be fractionally more efficient than > unnecessarily forcing 16 bit arithmetic. For that matter I > guess we could just say "int" and "unsigned int" since C > guarantees us at least 16 bits there. My guess was that in 2011 a compiler can optimize that itself if it is faster, and so it should be presented with the size that is really needed. It turns out to be the case on x86_64, but not on arm or ia64. I have therefore applied this patch as is.
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index c3ac96a..7df925a 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -1514,9 +1514,13 @@ uint64_t HELPER(neon_addl_saturate_s64)(uint64_t a, uint64_t b) return result; } -#define DO_ABD(dest, x, y, type) do { \ - type tmp_x = x; \ - type tmp_y = y; \ +/* We have to do the arithmetic in a larger type than + * the input type, because for example with a signed 32 bit + * op the absolute difference can overflow a signed 32 bit value. + */ +#define DO_ABD(dest, x, y, intype, arithtype) do { \ + arithtype tmp_x = (intype)(x); \ + arithtype tmp_y = (intype)(y); \ dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ } while(0) @@ -1524,12 +1528,12 @@ uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) { uint64_t tmp; uint64_t result; - DO_ABD(result, a, b, uint8_t); - DO_ABD(tmp, a >> 8, b >> 8, uint8_t); + DO_ABD(result, a, b, uint8_t, uint32_t); + DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); result |= tmp << 16; - DO_ABD(tmp, a >> 16, b >> 16, uint8_t); + DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); result |= tmp << 32; - DO_ABD(tmp, a >> 24, b >> 24, uint8_t); + DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); result |= tmp << 48; return result; } @@ -1538,12 +1542,12 @@ uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) { uint64_t tmp; uint64_t result; - DO_ABD(result, a, b, int8_t); - DO_ABD(tmp, a >> 8, b >> 8, int8_t); + DO_ABD(result, a, b, int8_t, int32_t); + DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); result |= tmp << 16; - DO_ABD(tmp, a >> 16, b >> 16, int8_t); + DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); result |= tmp << 32; - DO_ABD(tmp, a >> 24, b >> 24, int8_t); + DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); result |= tmp << 48; return result; } @@ -1552,8 +1556,8 @@ uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) { uint64_t tmp; uint64_t result; - DO_ABD(result, a, b, uint16_t); - DO_ABD(tmp, a >> 16, b >> 16, uint16_t); + 
DO_ABD(result, a, b, uint16_t, uint32_t); + DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); return result | (tmp << 32); } @@ -1561,22 +1565,22 @@ uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) { uint64_t tmp; uint64_t result; - DO_ABD(result, a, b, int16_t); - DO_ABD(tmp, a >> 16, b >> 16, int16_t); + DO_ABD(result, a, b, int16_t, int32_t); + DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); return result | (tmp << 32); } uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) { uint64_t result; - DO_ABD(result, a, b, uint32_t); + DO_ABD(result, a, b, uint32_t, uint64_t); return result; } uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) { uint64_t result; - DO_ABD(result, a, b, int32_t); + DO_ABD(result, a, b, int32_t, int64_t); return result; } #undef DO_ABD
In the VABAL instruction we take the absolute difference of two values of size x and store it in a result of size 2x. This means we have to be careful to calculate the absolute difference using a wide enough type that we don't accidentally overflow. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> --- target-arm/neon_helper.c | 38 +++++++++++++++++++++----------------- 1 files changed, 21 insertions(+), 17 deletions(-)