Message ID | 1370530985-20619-6-git-send-email-ard.biesheuvel@linaro.org |
---|---|
State | Accepted |
Commit | 7d11965ddb9b9b1e0a5d13c58345ada1ccbc663b |
Headers | show |
On Thu, 6 Jun 2013, Ard Biesheuvel wrote: > Rebased/reworked a patch contributed by Rob Herring that uses > NEON intrinsics to perform the RAID-6 syndrome calculations. > It uses the existing unroll.awk code to generate several > unrolled versions of which the best performing one is selected > at boot time. > > Output captured from an ARM Cortex-A15 @ 1.7 GHz: > > raid6: int32x1 200 MB/s > raid6: int32x2 304 MB/s > raid6: int32x4 388 MB/s > raid6: int32x8 423 MB/s > raid6: neonx1 799 MB/s > raid6: neonx2 1364 MB/s > raid6: neonx4 1731 MB/s > raid6: neonx8 1676 MB/s > raid6: using algorithm neonx4 (1731 MB/s) > raid6: using intx1 recovery algorithm > > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> > --- > include/linux/raid/pq.h | 5 ++++ > lib/raid6/.gitignore | 1 + > lib/raid6/Makefile | 31 +++++++++++++++++++ > lib/raid6/algos.c | 6 ++++ > lib/raid6/neon.c | 58 +++++++++++++++++++++++++++++++++++ > lib/raid6/neon.uc | 80 +++++++++++++++++++++++++++++++++++++++++++++++++ > lib/raid6/test/Makefile | 19 +++++++++++- > 7 files changed, 199 insertions(+), 1 deletion(-) > create mode 100644 lib/raid6/neon.c > create mode 100644 lib/raid6/neon.uc > > diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h > index 8dfaa2c..0f42469 100644 > --- a/include/linux/raid/pq.h > +++ b/include/linux/raid/pq.h > @@ -114,6 +114,11 @@ extern const struct raid6_recov_calls raid6_recov_intx1; > extern const struct raid6_recov_calls raid6_recov_ssse3; > extern const struct raid6_recov_calls raid6_recov_avx2; > > +extern const struct raid6_calls raid6_neonx1; > +extern const struct raid6_calls raid6_neonx2; > +extern const struct raid6_calls raid6_neonx4; > +extern const struct raid6_calls raid6_neonx8; > + > /* Algorithm list */ > extern const struct raid6_calls * const raid6_algos[]; > extern const struct raid6_recov_calls *const raid6_recov_algos[]; > diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore > index 162beca..0a7e494 100644 > --- a/lib/raid6/.gitignore > +++ b/lib/raid6/.gitignore > @@ -2,3 +2,4 @@ mktables > altivec*.c > int*.c > tables.c > +neon?.c > diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile > index 9f7c184..6a51f1a 100644 > --- a/lib/raid6/Makefile > +++ b/lib/raid6/Makefile > @@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ > > raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o > raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o > +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o > > hostprogs-y += mktables > > @@ -16,6 +17,12 @@ ifeq ($(CONFIG_ALTIVEC),y) > altivec_flags := -maltivec -mabi=altivec > endif > > +# The GCC option -ffreestanding is required in order to compile code containing > +# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) > +ifeq ($(CONFIG_KERNEL_MODE_NEON),y) > +NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon -ffreestanding > +endif > + > targets += int1.c > $(obj)/int1.c: UNROLL := 1 > $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE > @@ -70,6 +77,30 @@ $(obj)/altivec8.c: UNROLL := 8 > $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE > $(call if_changed,unroll) > > +CFLAGS_neon1.o += $(NEON_FLAGS) > +targets += neon1.c > +$(obj)/neon1.c: UNROLL := 1 > +$(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE > + $(call if_changed,unroll) > + > +CFLAGS_neon2.o += $(NEON_FLAGS) > +targets += neon2.c > +$(obj)/neon2.c: UNROLL := 2 > +$(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE > + $(call if_changed,unroll) > + > +CFLAGS_neon4.o += $(NEON_FLAGS) > +targets += neon4.c > +$(obj)/neon4.c: UNROLL := 4 > +$(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE > + $(call if_changed,unroll) > + > +CFLAGS_neon8.o += $(NEON_FLAGS) > +targets += neon8.c > +$(obj)/neon8.c: UNROLL := 8 > +$(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE > + $(call if_changed,unroll) > + > quiet_cmd_mktable = TABLE $@ > cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) > > diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c > index 6d7316f..74e6f56 100644 > --- a/lib/raid6/algos.c > +++ b/lib/raid6/algos.c > @@ -70,6 +70,12 @@ const struct raid6_calls * const raid6_algos[] = { > &raid6_intx2, > &raid6_intx4, > &raid6_intx8, > +#ifdef CONFIG_KERNEL_MODE_NEON > + &raid6_neonx1, > + &raid6_neonx2, > + &raid6_neonx4, > + &raid6_neonx8, > +#endif > NULL > }; > > diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c > new file mode 100644 > index 0000000..dad7102 > --- /dev/null > +++ b/lib/raid6/neon.c > @@ -0,0 +1,58 @@ > +/* > + * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics > + * > + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/raid/pq.h> > + > +#ifdef __KERNEL__ > +#include <asm/neon.h> > +#else > +#define kernel_neon_begin() > +#define kernel_neon_end() > +#define cpu_has_neon() (1) > +#endif > + > +/* > + * There are 2 reasons these wrappers are kept in a separate compilation unit > + * from the actual implementations in neonN.c (generated from neon.uc by > + * unroll.awk): > + * - the actual implementations use NEON intrinsics, and the GCC support header > + * (arm_neon.h) is not fully compatible (type wise) with the kernel; > + * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, > + * and we have to make sure that we never use *any* NEON/VFP instructions > + * outside a kernel_neon_begin()/kernel_neon_end() pair. > + */ > + > +#define RAID6_NEON_WRAPPER(_n) \ > + static void raid6_neon ## _n ## _gen_syndrome(int disks, \ > + size_t bytes, void **ptrs) \ > + { \ > + void raid6_neon ## _n ## _gen_syndrome_real(int, \ > + unsigned int, void**); \ > + kernel_neon_begin(); \ > + raid6_neon ## _n ## _gen_syndrome_real(disks, \ > + (unsigned int)bytes, ptrs); \ > + kernel_neon_end(); \ > + } \ > + struct raid6_calls const raid6_neonx ## _n = { \ > + raid6_neon ## _n ## _gen_syndrome, \ > + raid6_have_neon, \ > + "neonx" #_n, \ > + 0 \ > + }; > + > +static int raid6_have_neon(void) > +{ > + return cpu_has_neon(); > +} > + > +RAID6_NEON_WRAPPER(1) > +RAID6_NEON_WRAPPER(2) > +RAID6_NEON_WRAPPER(4) > +RAID6_NEON_WRAPPER(8) > diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc > new file mode 100644 > index 0000000..f2d7ec0 > --- /dev/null > +++ b/lib/raid6/neon.uc > @@ -0,0 +1,80 @@ > +/* ----------------------------------------------------------------------- > + * > + * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions > + * > + * Copyright (C) 2012 Rob Herring > + * > + * Based on altivec.uc: > + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, > + * Boston MA 02111-1307, USA; either version 2 of the License, or > + * (at your option) any later version; incorporated herein by reference. > + * > + * ----------------------------------------------------------------------- */ > + > +/* > + * neon$#.c > + * > + * $#-way unrolled NEON intrinsics math RAID-6 instruction set > + * > + * This file is postprocessed using unroll.awk > + */ > + > +#include <arm_neon.h> > + > +typedef uint8x16_t unative_t; > + > +#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) > +#define NSIZE sizeof(unative_t) > + > +/* > + * The SHLBYTE() operation shifts each byte left by 1, *not* > + * rolling over into the next byte > + */ > +static inline unative_t SHLBYTE(unative_t v) > +{ > + return vshlq_n_u8(v, 1); > +} > + > +/* > + * The MASK() operation returns 0xFF in any byte for which the high > + * bit is 1, 0x00 for any byte for which the high bit is 0. > + */ > +static inline unative_t MASK(unative_t v) > +{ > + const uint8x16_t temp = NBYTES(0); > + return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); > +} > + > +void raid6_neon$#_gen_syndrome_real(int disks, unsigned int bytes, void **ptrs) > +{ > + uint8_t **dptr = (uint8_t **)ptrs; > + uint8_t *p, *q; > + int d, z, z0; > + > + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; > + const unative_t x1d = NBYTES(0x1d); > + > + z0 = disks - 3; /* Highest data disk */ > + p = dptr[z0+1]; /* XOR parity */ > + q = dptr[z0+2]; /* RS syndrome */ > + > + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { > + wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); > + for ( z = z0-1 ; z >= 0 ; z-- ) { > + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); > + wp$$ = veorq_u8(wp$$, wd$$); > + w2$$ = MASK(wq$$); > + w1$$ = SHLBYTE(wq$$); > + > + w2$$ = vandq_u8(w2$$, x1d); > + w1$$ = veorq_u8(w1$$, w2$$); > + wq$$ = veorq_u8(w1$$, wd$$); > + } > + vst1q_u8(&p[d+NSIZE*$$], wp$$); > + vst1q_u8(&q[d+NSIZE*$$], wq$$); > + } > +} > diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile > index 087332d..bbe6450 100644 > --- a/lib/raid6/test/Makefile > +++ b/lib/raid6/test/Makefile > @@ -34,6 +34,11 @@ else > ifeq ($(HAS_ALTIVEC),yes) > OBJS += altivec1.o altivec2.o altivec4.o altivec8.o > endif > + ifeq ($(ARCH),arm) > + CFLAGS += -I../../../arch/arm/include -mfpu=neon \ > + -DCONFIG_KERNEL_MODE_NEON=1 > + OBJS += neon.o neon1.o neon2.o neon4.o neon8.o > + endif > endif > > .c.o: > @@ -55,6 +60,18 @@ raid6.a: $(OBJS) > raid6test: test.c raid6.a > $(CC) $(CFLAGS) -o raid6test $^ > > +neon1.c: neon.uc ../unroll.awk > + $(AWK) ../unroll.awk -vN=1 < neon.uc > $@ > + > +neon2.c: neon.uc ../unroll.awk > + $(AWK) ../unroll.awk -vN=2 < neon.uc > $@ > + > +neon4.c: neon.uc ../unroll.awk > + $(AWK) ../unroll.awk -vN=4 < neon.uc > $@ > + > +neon8.c: neon.uc ../unroll.awk > + $(AWK) ../unroll.awk -vN=8 < neon.uc > $@ > + > altivec1.c: altivec.uc ../unroll.awk > $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ > > @@ -89,7 +106,7 @@ tables.c: mktables > ./mktables > tables.c > > clean: > - rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test > + rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test > > spotless: clean > rm -f *~ > -- > 1.8.1.2 >
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 8dfaa2c..0f42469 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -114,6 +114,11 @@ extern const struct raid6_recov_calls raid6_recov_intx1; extern const struct raid6_recov_calls raid6_recov_ssse3; extern const struct raid6_recov_calls raid6_recov_avx2; +extern const struct raid6_calls raid6_neonx1; +extern const struct raid6_calls raid6_neonx2; +extern const struct raid6_calls raid6_neonx4; +extern const struct raid6_calls raid6_neonx8; + /* Algorithm list */ extern const struct raid6_calls * const raid6_algos[]; extern const struct raid6_recov_calls *const raid6_recov_algos[]; diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 162beca..0a7e494 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore @@ -2,3 +2,4 @@ mktables altivec*.c int*.c tables.c +neon?.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 9f7c184..6a51f1a 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o hostprogs-y += mktables @@ -16,6 +17,12 @@ ifeq ($(CONFIG_ALTIVEC),y) altivec_flags := -maltivec -mabi=altivec endif +# The GCC option -ffreestanding is required in order to compile code containing +# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) +ifeq ($(CONFIG_KERNEL_MODE_NEON),y) +NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon -ffreestanding +endif + targets += int1.c $(obj)/int1.c: UNROLL := 1 $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE @@ -70,6 +77,30 @@ $(obj)/altivec8.c: UNROLL := 8 $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) +CFLAGS_neon1.o += $(NEON_FLAGS) +targets += neon1.c +$(obj)/neon1.c: UNROLL := 1 +$(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_neon2.o += $(NEON_FLAGS) +targets += neon2.c +$(obj)/neon2.c: UNROLL := 2 +$(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_neon4.o += $(NEON_FLAGS) +targets += neon4.c +$(obj)/neon4.c: UNROLL := 4 +$(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_neon8.o += $(NEON_FLAGS) +targets += neon8.c +$(obj)/neon8.c: UNROLL := 8 +$(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + quiet_cmd_mktable = TABLE $@ cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 6d7316f..74e6f56 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -70,6 +70,12 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_intx2, &raid6_intx4, &raid6_intx8, +#ifdef CONFIG_KERNEL_MODE_NEON + &raid6_neonx1, + &raid6_neonx2, + &raid6_neonx4, + &raid6_neonx8, +#endif NULL }; diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c new file mode 100644 index 0000000..dad7102 --- /dev/null +++ b/lib/raid6/neon.c @@ -0,0 +1,58 @@ +/* + * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/raid/pq.h> + +#ifdef __KERNEL__ +#include <asm/neon.h> +#else +#define kernel_neon_begin() +#define kernel_neon_end() +#define cpu_has_neon() (1) +#endif + +/* + * There are 2 reasons these wrappers are kept in a separate compilation unit + * from the actual implementations in neonN.c (generated from neon.uc by + * unroll.awk): + * - the actual implementations use NEON intrinsics, and the GCC support header + * (arm_neon.h) is not fully compatible (type wise) with the kernel; + * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, + * and we have to make sure that we never use *any* NEON/VFP instructions + * outside a kernel_neon_begin()/kernel_neon_end() pair. + */ + +#define RAID6_NEON_WRAPPER(_n) \ + static void raid6_neon ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_neon ## _n ## _gen_syndrome_real(int, \ + unsigned int, void**); \ + kernel_neon_begin(); \ + raid6_neon ## _n ## _gen_syndrome_real(disks, \ + (unsigned int)bytes, ptrs); \ + kernel_neon_end(); \ + } \ + struct raid6_calls const raid6_neonx ## _n = { \ + raid6_neon ## _n ## _gen_syndrome, \ + raid6_have_neon, \ + "neonx" #_n, \ + 0 \ + }; + +static int raid6_have_neon(void) +{ + return cpu_has_neon(); +} + +RAID6_NEON_WRAPPER(1) +RAID6_NEON_WRAPPER(2) +RAID6_NEON_WRAPPER(4) +RAID6_NEON_WRAPPER(8) diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc new file mode 100644 index 0000000..f2d7ec0 --- /dev/null +++ b/lib/raid6/neon.uc @@ -0,0 +1,80 @@ +/* ----------------------------------------------------------------------- + * + * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions + * + * Copyright (C) 2012 Rob Herring + * + * Based on altivec.uc: + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * neon$#.c + * + * $#-way unrolled NEON intrinsics math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <arm_neon.h> + +typedef uint8x16_t unative_t; + +#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline unative_t SHLBYTE(unative_t v) +{ + return vshlq_n_u8(v, 1); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline unative_t MASK(unative_t v) +{ + const uint8x16_t temp = NBYTES(0); + return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); +} + +void raid6_neon$#_gen_syndrome_real(int disks, unsigned int bytes, void **ptrs) +{ + uint8_t **dptr = (uint8_t **)ptrs; + uint8_t *p, *q; + int d, z, z0; + + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + const unative_t x1d = NBYTES(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); + wp$$ = veorq_u8(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + w1$$ = veorq_u8(w1$$, w2$$); + wq$$ = veorq_u8(w1$$, wd$$); + } + vst1q_u8(&p[d+NSIZE*$$], wp$$); + vst1q_u8(&q[d+NSIZE*$$], wq$$); + } +} diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 087332d..bbe6450 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -34,6 +34,11 @@ else ifeq ($(HAS_ALTIVEC),yes) OBJS += altivec1.o altivec2.o altivec4.o altivec8.o endif + ifeq ($(ARCH),arm) + CFLAGS += -I../../../arch/arm/include -mfpu=neon \ + -DCONFIG_KERNEL_MODE_NEON=1 + OBJS += neon.o neon1.o neon2.o neon4.o neon8.o + endif endif .c.o: @@ -55,6 +60,18 @@ raid6.a: $(OBJS) raid6test: test.c raid6.a $(CC) $(CFLAGS) -o raid6test $^ +neon1.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < neon.uc > $@ + +neon2.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < neon.uc > $@ + +neon4.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < neon.uc > $@ + +neon8.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < neon.uc > $@ + altivec1.c: altivec.uc ../unroll.awk $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ @@ -89,7 +106,7 @@ tables.c: mktables ./mktables > tables.c clean: - rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test + rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test spotless: clean rm -f *~
Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Output captured from an ARM Cortex-A15 @ 1.7 GHz: raid6: int32x1 200 MB/s raid6: int32x2 304 MB/s raid6: int32x4 388 MB/s raid6: int32x8 423 MB/s raid6: neonx1 799 MB/s raid6: neonx2 1364 MB/s raid6: neonx4 1731 MB/s raid6: neonx8 1676 MB/s raid6: using algorithm neonx4 (1731 MB/s) raid6: using intx1 recovery algorithm Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> --- include/linux/raid/pq.h | 5 ++++ lib/raid6/.gitignore | 1 + lib/raid6/Makefile | 31 +++++++++++++++++++ lib/raid6/algos.c | 6 ++++ lib/raid6/neon.c | 58 +++++++++++++++++++++++++++++++++++ lib/raid6/neon.uc | 80 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/raid6/test/Makefile | 19 +++++++++++- 7 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 lib/raid6/neon.c create mode 100644 lib/raid6/neon.uc