Message ID | CAKdteObQJA=UH0snnyPBnOJxQDaeaQMGxYDQj56=do3=FF4ZmQ@mail.gmail.com
---|---
State | Accepted
Commit | 052ef81d2dbef6727f9684733cfbb70f7adcf066
On Mon, Dec 12, 2016 at 05:03:31PM +0100, Christophe Lyon wrote:
> Hi,
>
> After the recent update from Tamar, I noticed a few discrepancies
> between ARM and AArch64 regarding a few poly64 intrinsics.
>
> This patch:
> - adds vtst_p64 and vtstq_p64 to AArch64's arm_neon.h
> - adds vgetq_lane_p64, vset_lane_p64 and vsetq_lane_p64 to ARM's arm_neon.h
>   (vget_lane_p64 was already there)
> - adds the corresponding tests, and moves the vget_lane_p64 ones out
>   of the #ifdef __aarch64__ zone.
>
> Cross-tested on arm* and aarch64* targets.
>
> OK?

The AArch64 parts of this look fine to me, but I do have one question on
your inline assembly implementation for vtstq_p64:

> +__extension__ extern __inline uint64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vtstq_p64 (poly64x2_t a, poly64x2_t b)
> +{
> +  uint64x2_t result;
> +  __asm__ ("cmtst %0.2d, %1.2d, %2.2d"
> +           : "=w"(result)
> +           : "w"(a), "w"(b)
> +           : /* No clobbers */);
> +  return result;
> +}
> +

Why can this not be written as many of the other vtstq intrinsics are; e.g.:

__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vtstq_p64 (poly64x2_t __a, poly64x2_t __b)
{
  return (uint64x2_t) ((((uint64x2_t) __a) & ((uint64x2_t) __b))
                       != __AARCH64_INT64_C (0));
}

Thanks,
James

> gcc/ChangeLog:
>
> 2016-12-12  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         * config/aarch64/arm_neon.h (vtst_p64): New.
>         (vtstq_p64): New.
>         * config/arm/arm_neon.h (vgetq_lane_p64): New.
>         (vset_lane_p64): New.
>         (vsetq_lane_p64): New.
>
> gcc/testsuite/ChangeLog:
>
> 2016-12-12  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
>         (vget_lane_expected, vset_lane_expected, vtst_expected_poly64x1):
>         New.
>         (vmov_n_expected0, vmov_n_expected1, vmov_n_expected2)
>         (expected_vld_st2_0, expected_vld_st2_1, expected_vld_st3_0)
>         (expected_vld_st3_1, expected_vld_st3_2, expected_vld_st4_0)
>         (expected_vld_st4_1, expected_vld_st4_2, expected_vld_st4_3)
>         (vtst_expected_poly64x2): Move to aarch64-only section.
>         (vget_lane_p64, vgetq_lane_p64, vset_lane_p64, vsetq_lane_p64)
>         (vtst_p64, vtstq_p64): New tests.
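For readers less familiar with the intrinsic, the behaviour James appeals to can be modelled in plain C: each 64-bit lane of the result is all-ones when the bitwise AND of the corresponding input lanes is non-zero, and zero otherwise. The snippet below is only an illustrative scalar model of that semantics; it is not part of the patch and does not use arm_neon.h.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar model of the vtstq_p64 semantics: each 64-bit lane of the
   result is all-ones when (a & b) is non-zero for that lane.  */
static void
vtstq_p64_model (const uint64_t a[2], const uint64_t b[2], uint64_t res[2])
{
  for (int i = 0; i < 2; i++)
    res[i] = (a[i] & b[i]) != 0 ? ~UINT64_C (0) : UINT64_C (0);
}

int
main (void)
{
  uint64_t a[2] = { 0xfffffffffffffff0u, 0xfffffffffffffff1u };
  uint64_t b[2] = { 0x5, 0x5 };
  uint64_t r[2];

  vtstq_p64_model (a, b, r);
  /* Lane 0: 0x...f0 & 0x5 == 0, so the lane is 0.
     Lane 1: 0x...f1 & 0x5 == 1, so the lane is all-ones.  */
  printf ("0x%" PRIx64 " 0x%" PRIx64 "\n", r[0], r[1]);
  return 0;
}
```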
On 14 December 2016 at 17:55, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> The AArch64 parts of this look fine to me, but I do have one question on
> your inline assembly implementation for vtstq_p64:
> [...]
> Why can this not be written as many of the other vtstq intrinsics are; e.g.:
>
> __extension__ extern __inline uint64x2_t
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vtstq_p64 (poly64x2_t __a, poly64x2_t __b)
> {
>   return (uint64x2_t) ((((uint64x2_t) __a) & ((uint64x2_t) __b))
>                        != __AARCH64_INT64_C (0));
> }

I don't know, I just followed the pattern used for vtstq_p8 and vtstq_p16
just above...
Ping?

On 14 December 2016 at 23:09, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> [...]
Ping?

James, I'm not sure whether your comment was a request for a new
version of my patch or just FYI?

On 3 January 2017 at 16:47, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> Ping?
>
> [...]
Hello,

Is it too late for this patch?

On 11 January 2017 at 11:13, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> Ping?
>
> James, I'm not sure whether your comment was a request for a new
> version of my patch or just FYI?
>
> [...]
On Wed, Jan 11, 2017 at 11:13:07AM +0100, Christophe Lyon wrote:
> Ping?
>
> James, I'm not sure whether your comment was a request for a new
> version of my patch or just FYI?

Sorry that this was unclear. I was looking for a new version of the patch
covering this comment. Otherwise we just have debt to go fix it in future.

With the suggested change, the AArch64 parts of this patch are OK - adding
missing intrinsics is very safe (even in Stage 4).

Please post an updated patch, and give Richard and Marcus a reasonable
amount of time to object to taking the patch this late. (And you need an
AArch32 OK too.)

Thanks,
James
On 4 February 2017 at 22:54, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> Sorry that this was unclear. I was looking for a new version of the patch
> covering this comment. Otherwise we just have debt to go fix it in future.
>
> With the suggested change, the AArch64 parts of this patch are OK - adding
> missing intrinsics is very safe (even in Stage 4).
>
> Please post an updated patch, and give Richard and Marcus a reasonable
> amount of time to object to taking the patch this late. (And you need an
> AArch32 OK too.)
>
> Thanks,
> James

Hi James,

Thanks for the clarification, here is an updated patch.

I had to make a few changes after rebasing, and I also took the opportunity
to rewrite the existing vtst_p8, vtst_p16, vtstq_p8 and vtstq_p16 without
an asm() statement.

As before, the aarch64 and aarch32 updates to arm_neon.h are independent,
but I found it simpler to group them, as they imply updates to the same test.

Tested as usual, cross-testing on several arm* and aarch64* configurations,
no regression.

OK?

Thanks,

Christophe

>> [...]
>> >>>> * config/aarch64/arm_neon.h (vtst_p64): New.
>> >>>> (vtstq_p64): New.
>> >>>> * config/arm/arm_neon.h (vgetq_lane_p64): New.
>> >>>> (vset_lane_p64): New.
>> >>>> (vsetq_lane_p64): New. >> >>>> >> >>>> gcc/testsuite/ChangeLog: >> >>>> >> >>>> 2016-12-12 Christophe Lyon <christophe.lyon@linaro.org> >> >>>> >> >>>> * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c >> >>>> (vget_lane_expected, vset_lane_expected, vtst_expected_poly64x1): >> >>>> New. >> >>>> (vmov_n_expected0, vmov_n_expected1, vmov_n_expected2) >> >>>> (expected_vld_st2_0, expected_vld_st2_1, expected_vld_st3_0) >> >>>> (expected_vld_st3_1, expected_vld_st3_2, expected_vld_st4_0) >> >>>> (expected_vld_st4_1, expected_vld_st4_2, expected_vld_st4_3) >> >>>> (vtst_expected_poly64x2): Move to aarch64-only section. >> >>>> (vget_lane_p64, vgetq_lane_p64, vset_lane_p64, vsetq_lane_p64) >> >>>> (vtst_p64, vtstq_p64): New tests. >> >>>> >> >>> >> >>> > gcc/ChangeLog: 2017-02-06 Christophe Lyon <christophe.lyon@linaro.org> * config/aarch64/arm_neon.h (vtst_p8): Rewrite without asm. (vtst_p16): Likewise. (vtstq_p8): Likewise. (vtstq_p16): Likewise. (vtst_p64): New. (vtstq_p64): Likewise. * config/arm/arm_neon.h (vgetq_lane_p64): New. (vset_lane_p64): New. (vsetq_lane_p64): New. gcc/testsuite/ChangeLog: 2017-02-06 Christophe Lyon <christophe.lyon@linaro.org> * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c (vget_lane_expected, vset_lane_expected, vtst_expected_poly64x1): New. (vmov_n_expected0, vmov_n_expected1, vmov_n_expected2) (expected_vld_st2_0, expected_vld_st2_1, expected_vld_st3_0) (expected_vld_st3_1, expected_vld_st3_2, expected_vld_st4_0) (expected_vld_st4_1, expected_vld_st4_2, expected_vld_st4_3) (vtst_expected_poly64x2): Move to aarch64-only section. (vget_lane_p64, vgetq_lane_p64, vset_lane_p64, vsetq_lane_p64) (vtst_p64, vtstq_p64): New tests. commit d8eebfd0523115ad743a3a273f6dbf79e3d41d5c Author: Christophe Lyon <christophe.lyon@linaro.org> Date: Sun Feb 5 20:43:13 2017 +0000 ARM/AArch64: add missing poly64 intrinsics (vtst on aarch64, vget_lane on arm) Change-Id: I334e0fa6ab07d473609ed96d9ab8cb56ebd521acdiff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index a54c0be..0753da3 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -10862,48 +10862,47 @@ __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_p8 (poly8x8_t a, poly8x8_t b) { - uint8x8_t result; - __asm__ ("cmtst %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint8x8_t) ((((uint8x8_t) a) & ((uint8x8_t) b)) + != 0); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_p16 (poly16x4_t a, poly16x4_t b) { - uint16x4_t result; - __asm__ ("cmtst %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint16x4_t) ((((uint16x4_t) a) & ((uint16x4_t) b)) + != 0); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_p64 (poly64x1_t a, poly64x1_t b) +{ + return (uint64x1_t) ((a & b) != __AARCH64_INT64_C (0)); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_p8 (poly8x16_t a, poly8x16_t b) { - uint8x16_t result; - __asm__ ("cmtst %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint8x16_t) ((((uint8x16_t) a) & ((uint8x16_t) b)) + != 0); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vtstq_p16 (poly16x8_t a, poly16x8_t b) { - uint16x8_t result; - __asm__ ("cmtst %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint16x8_t) ((((uint16x8_t) a) & ((uint16x8_t) b)) + != 0); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p64 (poly64x2_t a, poly64x2_t b) +{ + return (uint64x2_t) ((((uint64x2_t) a) & ((uint64x2_t) b)) + != __AARCH64_INT64_C (0)); } /* End of temporary inline asm implementations. */ diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 16bf8c5..f81d77e 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -6309,6 +6309,16 @@ vgetq_lane_s64 (int64x2_t __a, const int __b) return (int64_t)__builtin_neon_vget_lanev2di (__a, __b); } +#pragma GCC push_options +#pragma GCC target ("fpu=crypto-neon-fp-armv8") +__extension__ extern __inline poly64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vgetq_lane_p64 (poly64x2_t __a, const int __b) +{ + return (poly64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b); +} + +#pragma GCC pop_options __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vgetq_lane_u64 (uint64x2_t __a, const int __b) @@ -6405,6 +6415,16 @@ vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c) return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c); } +#pragma GCC push_options +#pragma GCC target ("fpu=crypto-neon-fp-armv8") +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vset_lane_p64 (poly64_t __a, poly64x1_t __b, const int __c) +{ + return (poly64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c); +} + +#pragma GCC pop_options __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c) @@ -6496,6 +6516,13 @@ vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsetq_lane_p64 (poly64_t __a, poly64x2_t __b, const int __c) +{ + return (poly64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c); +} + __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcreate_p64 (uint64_t __a) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c index 7c5bca2..a3210a9 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c @@ -39,17 +39,6 @@ VECT_VAR_DECL(vdup_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; VECT_VAR_DECL(vdup_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, 0xfffffffffffffff2 }; -/* Expected results: vmov_n. 
*/ -VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 }; -VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0, - 0xfffffffffffffff0 }; -VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 }; -VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1, - 0xfffffffffffffff1 }; -VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; -VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, - 0xfffffffffffffff2 }; - /* Expected results: vext. */ VECT_VAR_DECL(vext_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(vext_expected,poly,64,2) [] = { 0xfffffffffffffff1, 0x88 }; @@ -124,6 +113,29 @@ VECT_VAR_DECL(vst1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(vst1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x3333333333333333 }; +/* Expected results: vget_lane. */ +VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0; +VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0; + +/* Expected results: vset_lane. */ +VECT_VAR_DECL(vset_lane_expected,poly,64,1) [] = { 0x88 }; +VECT_VAR_DECL(vset_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x11 }; + +/* Expected results: vtst. */ +VECT_VAR_DECL(vtst_expected,uint,64,1) [] = { 0x0 }; + +#ifdef __aarch64__ +/* Expected results: vmov_n. */ +VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; +VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, + 0xfffffffffffffff2 }; + /* Expected results: vldX_lane. */ VECT_VAR_DECL(expected_vld_st2_0,poly,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected_vld_st2_0,poly,64,2) [] = { 0xfffffffffffffff0, @@ -153,9 +165,9 @@ VECT_VAR_DECL(expected_vld_st4_3,poly,64,1) [] = { 0xfffffffffffffff3 }; VECT_VAR_DECL(expected_vld_st4_3,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }; -/* Expected results: vget_lane. */ -VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0; -VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0; +/* Expected results: vtst. */ +VECT_VAR_DECL(vtst_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff }; +#endif int main (void) { @@ -727,7 +739,105 @@ int main (void) CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, ""); CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, ""); + /* vget_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VGET_LANE/VGETQ_LANE" + +#define TEST_VGET_LANE(Q, T1, T2, W, N, L) \ + VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vget_lane_vector1, T1, W, N), L); \ + if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) { \ + fprintf(stderr, \ + "ERROR in %s (%s line %d in result '%s') at type %s " \ + "got 0x%" PRIx##W " != 0x%" PRIx##W "\n", \ + TEST_MSG, __FILE__, __LINE__, \ + STR(VECT_VAR(vget_lane_expected, T1, W, N)), \ + STR(VECT_NAME(T1, W, N)), \ + VECT_VAR(vget_lane_vector, T1, W, N), \ + VECT_VAR(vget_lane_expected, T1, W, N)); \ + abort (); \ + } + + /* Initialize input values. 
*/ + DECL_VARIABLE(vget_lane_vector1, poly, 64, 1); + DECL_VARIABLE(vget_lane_vector1, poly, 64, 2); + + VLOAD(vget_lane_vector1, buffer, , poly, p, 64, 1); + VLOAD(vget_lane_vector1, buffer, q, poly, p, 64, 2); + + VECT_VAR_DECL(vget_lane_vector, poly, 64, 1); + VECT_VAR_DECL(vget_lane_vector, poly, 64, 2); + + TEST_VGET_LANE( , poly, p, 64, 1, 0); + TEST_VGET_LANE(q, poly, p, 64, 2, 0); + + + /* vset_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VSET_LANE/VSETQ_LANE" + +#define TEST_VSET_LANE(Q, T1, T2, W, N, V, L) \ + VECT_VAR(vset_lane_vector, T1, W, N) = \ + vset##Q##_lane_##T2##W(V, \ + VECT_VAR(vset_lane_vector, T1, W, N), \ + L); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vset_lane_vector, T1, W, N)) + + /* Initialize input values. */ + DECL_VARIABLE(vset_lane_vector, poly, 64, 1); + DECL_VARIABLE(vset_lane_vector, poly, 64, 2); + + CLEAN(result, uint, 64, 1); + CLEAN(result, uint, 64, 2); + + VLOAD(vset_lane_vector, buffer, , poly, p, 64, 1); + VLOAD(vset_lane_vector, buffer, q, poly, p, 64, 2); + + /* Choose value and lane arbitrarily. */ + TEST_VSET_LANE(, poly, p, 64, 1, 0x88, 0); + TEST_VSET_LANE(q, poly, p, 64, 2, 0x11, 1); + + CHECK(TEST_MSG, poly, 64, 1, PRIx64, vset_lane_expected, ""); + CHECK(TEST_MSG, poly, 64, 2, PRIx64, vset_lane_expected, ""); + + + /* vtst_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VTST" + +#define TEST_VTST1(INSN, Q, T1, T2, W, N) \ + VECT_VAR(vtst_vector_res, uint, W, N) = \ + INSN##Q##_##T2##W(VECT_VAR(vtst_vector, T1, W, N), \ + VECT_VAR(vtst_vector2, T1, W, N)); \ + vst1##Q##_u##W(VECT_VAR(result, uint, W, N), \ + VECT_VAR(vtst_vector_res, uint, W, N)) + +#define TEST_VTST(INSN, Q, T1, T2, W, N) \ + TEST_VTST1(INSN, Q, T1, T2, W, N) \ + + /* Initialize input values. */ + DECL_VARIABLE(vtst_vector, poly, 64, 1); + DECL_VARIABLE(vtst_vector2, poly, 64, 1); + DECL_VARIABLE(vtst_vector_res, uint, 64, 1); + + CLEAN(result, uint, 64, 1); + + VLOAD(vtst_vector, buffer, , poly, p, 64, 1); + VDUP(vtst_vector2, , poly, p, 64, 1, 5); + + TEST_VTST(vtst, , poly, p, 64, 1); + + CHECK(TEST_MSG, uint, 64, 1, PRIx64, vtst_expected, ""); + + /* vtstq_p64 is supported by aarch64 only. */ #ifdef __aarch64__ + DECL_VARIABLE(vtst_vector, poly, 64, 2); + DECL_VARIABLE(vtst_vector2, poly, 64, 2); + DECL_VARIABLE(vtst_vector_res, uint, 64, 2); + CLEAN(result, uint, 64, 2); + VLOAD(vtst_vector, buffer, q, poly, p, 64, 2); + VDUP(vtst_vector2, q, poly, p, 64, 2, 5); + TEST_VTST(vtst, q, poly, p, 64, 2); + CHECK(TEST_MSG, uint, 64, 2, PRIx64, vtst_expected, ""); /* vmov_n_p64 tests. */ #undef TEST_MSG @@ -767,37 +877,6 @@ int main (void) } } - /* vget_lane_p64 tests. */ -#undef TEST_MSG -#define TEST_MSG "VGET_LANE/VGETQ_LANE" - -#define TEST_VGET_LANE(Q, T1, T2, W, N, L) \ - VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \ - if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) { \ - fprintf(stderr, \ - "ERROR in %s (%s line %d in result '%s') at type %s " \ - "got 0x%" PRIx##W " != 0x%" PRIx##W "\n", \ - TEST_MSG, __FILE__, __LINE__, \ - STR(VECT_VAR(vget_lane_expected, T1, W, N)), \ - STR(VECT_NAME(T1, W, N)), \ - (uint##W##_t)VECT_VAR(vget_lane_vector, T1, W, N), \ - (uint##W##_t)VECT_VAR(vget_lane_expected, T1, W, N)); \ - abort (); \ - } - - /* Initialize input values. 
*/ - DECL_VARIABLE(vector, poly, 64, 1); - DECL_VARIABLE(vector, poly, 64, 2); - - VLOAD(vector, buffer, , poly, p, 64, 1); - VLOAD(vector, buffer, q, poly, p, 64, 2); - - VECT_VAR_DECL(vget_lane_vector, poly, 64, 1); - VECT_VAR_DECL(vget_lane_vector, poly, 64, 2); - - TEST_VGET_LANE( , poly, p, 64, 1, 0); - TEST_VGET_LANE(q, poly, p, 64, 2, 0); - /* vldx_lane_p64 tests. */ #undef TEST_MSG #define TEST_MSG "VLDX_LANE/VLDXQ_LANE"
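For completeness, here is a hypothetical usage sketch of the lane and test intrinsics touched by the patch above; it is not part of the patch or its testsuite, and it assumes a toolchain where the poly64 types are available (e.g. -mfpu=crypto-neon-fp-armv8 on ARM, or an AArch64 target with the crypto extension).

```c
#include <arm_neon.h>
#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  /* Build two poly64x1_t values from 64-bit integers.  */
  poly64x1_t a = vcreate_p64 (0xfffffffffffffff0u);
  poly64x1_t b = vcreate_p64 (0x5u);

  /* Overwrite the single lane of 'b', then read it back.  */
  b = vset_lane_p64 (0x11, b, 0);
  poly64_t lane = vget_lane_p64 (b, 0);

  /* vtst_p64: the result lane is all-ones when (a & b) has any bit set.
     Here 0x...f0 & 0x11 == 0x10, so the lane is all-ones.  */
  uint64x1_t t = vtst_p64 (a, b);

  printf ("lane = 0x%" PRIx64 ", tst = 0x%" PRIx64 "\n",
          (uint64_t) lane, vget_lane_u64 (t, 0));
  return 0;
}
```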
Hi Christophe,

On 06/02/17 08:31, Christophe Lyon wrote:
> [...]
> Tested as usual, cross-testing on several arm* and aarch64* configurations,
> no regression.
>
> OK?

Ok for arm.

Thanks,
Kyrill
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index b846644..74d163e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -10882,6 +10882,13 @@ vtst_p16 (poly16x4_t a, poly16x4_t b) return result; } +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_p64 (poly64x1_t a, poly64x1_t b) +{ + return (uint64x1_t) ((a & b) != __AARCH64_INT64_C (0)); +} + __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_p8 (poly8x16_t a, poly8x16_t b) @@ -10906,6 +10913,18 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b) return result; } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p64 (poly64x2_t a, poly64x2_t b) +{ + uint64x2_t result; + __asm__ ("cmtst %0.2d, %1.2d, %2.2d" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + /* End of temporary inline asm implementations. */ /* Start of temporary inline asm for vldn, vstn and friends. */ diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index ab29da7..d199b41 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -5497,6 +5497,15 @@ vgetq_lane_s64 (int64x2_t __a, const int __b) return (int64_t)__builtin_neon_vget_lanev2di (__a, __b); } +#pragma GCC push_options +#pragma GCC target ("fpu=crypto-neon-fp-armv8") +__extension__ static __inline poly64_t __attribute__ ((__always_inline__)) +vgetq_lane_p64 (poly64x2_t __a, const int __b) +{ + return (poly64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b); +} + +#pragma GCC pop_options __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) vgetq_lane_u64 (uint64x2_t __a, const int __b) { @@ -5581,6 +5590,15 @@ vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c) return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c); } +#pragma GCC push_options +#pragma GCC target ("fpu=crypto-neon-fp-armv8") +__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +vset_lane_p64 (poly64_t __a, poly64x1_t __b, const int __c) +{ + return (poly64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c); +} + +#pragma GCC pop_options __extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c) { @@ -5661,6 +5679,12 @@ vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c) #pragma GCC push_options #pragma GCC target ("fpu=crypto-neon-fp-armv8") +__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +vsetq_lane_p64 (poly64_t __a, poly64x2_t __b, const int __c) +{ + return (poly64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c); +} + __extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) vcreate_p64 (uint64_t __a) { diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c index 8907b38..ba8fbeb 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c @@ -39,17 +39,6 @@ VECT_VAR_DECL(vdup_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; VECT_VAR_DECL(vdup_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, 0xfffffffffffffff2 }; -/* Expected results: vmov_n. 
*/ -VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 }; -VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0, - 0xfffffffffffffff0 }; -VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 }; -VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1, - 0xfffffffffffffff1 }; -VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; -VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, - 0xfffffffffffffff2 }; - /* Expected results: vext. */ VECT_VAR_DECL(vext_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(vext_expected,poly,64,2) [] = { 0xfffffffffffffff1, 0x88 }; @@ -124,6 +113,29 @@ VECT_VAR_DECL(vst1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(vst1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x3333333333333333 }; +/* Expected results: vget_lane. */ +VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0; +VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0; + +/* Expected results: vset_lane. */ +VECT_VAR_DECL(vset_lane_expected,poly,64,1) [] = { 0x88 }; +VECT_VAR_DECL(vset_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x11 }; + +/* Expected results: vtst. */ +VECT_VAR_DECL(vtst_expected,uint,64,1) [] = { 0x0 }; + +#ifdef __aarch64__ +/* Expected results: vmov_n. */ +VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 }; +VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0, + 0xfffffffffffffff0 }; +VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 }; +VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1, + 0xfffffffffffffff1 }; +VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 }; +VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2, + 0xfffffffffffffff2 }; + /* Expected results: vldX_lane. */ VECT_VAR_DECL(expected_vld_st2_0,poly,64,1) [] = { 0xfffffffffffffff0 }; VECT_VAR_DECL(expected_vld_st2_0,poly,64,2) [] = { 0xfffffffffffffff0, @@ -153,9 +165,9 @@ VECT_VAR_DECL(expected_vld_st4_3,poly,64,1) [] = { 0xfffffffffffffff3 }; VECT_VAR_DECL(expected_vld_st4_3,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }; -/* Expected results: vget_lane. */ -VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0; -VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0; +/* Expected results: vtst. */ +VECT_VAR_DECL(vtst_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff }; +#endif int main (void) { @@ -727,6 +739,107 @@ int main (void) CHECK(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, ""); CHECK(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, ""); + /* vget_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VGET_LANE/VGETQ_LANE" + +#define TEST_VGET_LANE(Q, T1, T2, W, N, L) \ + VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vget_lane_vector1, T1, W, N), L); \ + if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) { \ + fprintf(stderr, \ + "ERROR in %s (%s line %d in result '%s') at type %s " \ + "got 0x%" PRIx##W " != 0x%" PRIx##W "\n", \ + TEST_MSG, __FILE__, __LINE__, \ + STR(VECT_VAR(vget_lane_expected, T1, W, N)), \ + STR(VECT_NAME(T1, W, N)), \ + VECT_VAR(vget_lane_vector, T1, W, N), \ + VECT_VAR(vget_lane_expected, T1, W, N)); \ + abort (); \ + } + + /* Initialize input values. 
*/ + DECL_VARIABLE(vget_lane_vector1, poly, 64, 1); + DECL_VARIABLE(vget_lane_vector1, poly, 64, 2); + + VLOAD(vget_lane_vector1, buffer, , poly, p, 64, 1); + VLOAD(vget_lane_vector1, buffer, q, poly, p, 64, 2); + + VECT_VAR_DECL(vget_lane_vector, poly, 64, 1); + VECT_VAR_DECL(vget_lane_vector, poly, 64, 2); + + TEST_VGET_LANE( , poly, p, 64, 1, 0); + TEST_VGET_LANE(q, poly, p, 64, 2, 0); + + + /* vset_lane_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VSET_LANE/VSETQ_LANE" + +#define TEST_VSET_LANE(Q, T1, T2, W, N, V, L) \ + VECT_VAR(vset_lane_vector, T1, W, N) = \ + vset##Q##_lane_##T2##W(V, \ + VECT_VAR(vset_lane_vector, T1, W, N), \ + L); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vset_lane_vector, T1, W, N)) + + /* Initialize input values. */ + DECL_VARIABLE(vset_lane_vector, poly, 64, 1); + DECL_VARIABLE(vset_lane_vector, poly, 64, 2); + + CLEAN(result, uint, 64, 1); + CLEAN(result, uint, 64, 2); + + VLOAD(vset_lane_vector, buffer, , poly, p, 64, 1); + VLOAD(vset_lane_vector, buffer, q, poly, p, 64, 2); + + /* Choose value and lane arbitrarily. */ + TEST_VSET_LANE(, poly, p, 64, 1, 0x88, 0); + TEST_VSET_LANE(q, poly, p, 64, 2, 0x11, 1); + + CHECK(TEST_MSG, poly, 64, 1, PRIx64, vset_lane_expected, ""); + CHECK(TEST_MSG, poly, 64, 2, PRIx64, vset_lane_expected, ""); + + + /* vtst_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VTST" + +#define TEST_VTST1(INSN, Q, T1, T2, W, N) \ + VECT_VAR(vtst_vector_res, uint, W, N) = \ + INSN##Q##_##T2##W(VECT_VAR(vtst_vector, T1, W, N), \ + VECT_VAR(vtst_vector2, T1, W, N)); \ + vst1##Q##_u##W(VECT_VAR(result, uint, W, N), \ + VECT_VAR(vtst_vector_res, uint, W, N)) + +#define TEST_VTST(INSN, Q, T1, T2, W, N) \ + TEST_VTST1(INSN, Q, T1, T2, W, N) \ + + /* Initialize input values. */ + DECL_VARIABLE(vtst_vector, poly, 64, 1); + DECL_VARIABLE(vtst_vector2, poly, 64, 1); + DECL_VARIABLE(vtst_vector_res, uint, 64, 1); + + CLEAN(result, uint, 64, 1); + + VLOAD(vtst_vector, buffer, , poly, p, 64, 1); + VDUP(vtst_vector2, , poly, p, 64, 1, 5); + + TEST_VTST(vtst, , poly, p, 64, 1); + + CHECK(TEST_MSG, uint, 64, 1, PRIx64, vtst_expected, ""); + + /* vtstq_p64 is supported by aarch64 only. */ +#ifdef __aarch64__ + DECL_VARIABLE(vtst_vector, poly, 64, 2); + DECL_VARIABLE(vtst_vector2, poly, 64, 2); + DECL_VARIABLE(vtst_vector_res, uint, 64, 2); + CLEAN(result, uint, 64, 2); + VLOAD(vtst_vector, buffer, q, poly, p, 64, 2); + VDUP(vtst_vector2, q, poly, p, 64, 2, 5); + TEST_VTST(vtst, q, poly, p, 64, 2); + CHECK(TEST_MSG, uint, 64, 2, PRIx64, vtst_expected, ""); +#endif + #ifdef __aarch64__ /* vmov_n_p64 tests. */ @@ -767,37 +880,6 @@ int main (void) } } - /* vget_lane_p64 tests. */ -#undef TEST_MSG -#define TEST_MSG "VGET_LANE/VGETQ_LANE" - -#define TEST_VGET_LANE(Q, T1, T2, W, N, L) \ - VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \ - if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) { \ - fprintf(stderr, \ - "ERROR in %s (%s line %d in result '%s') at type %s " \ - "got 0x%" PRIx##W " != 0x%" PRIx##W "\n", \ - TEST_MSG, __FILE__, __LINE__, \ - STR(VECT_VAR(vget_lane_expected, T1, W, N)), \ - STR(VECT_NAME(T1, W, N)), \ - VECT_VAR(vget_lane_vector, T1, W, N), \ - VECT_VAR(vget_lane_expected, T1, W, N)); \ - abort (); \ - } - - /* Initialize input values. 
*/ - DECL_VARIABLE(vector, poly, 64, 1); - DECL_VARIABLE(vector, poly, 64, 2); - - VLOAD(vector, buffer, , poly, p, 64, 1); - VLOAD(vector, buffer, q, poly, p, 64, 2); - - VECT_VAR_DECL(vget_lane_vector, poly, 64, 1); - VECT_VAR_DECL(vget_lane_vector, poly, 64, 2); - - TEST_VGET_LANE( , poly, p, 64, 1, 0); - TEST_VGET_LANE(q, poly, p, 64, 2, 0); - /* vldx_lane_p64 tests. */ #undef TEST_MSG #define TEST_MSG "VLDX_LANE/VLDXQ_LANE"