Message ID | 1398427560-31219-1-git-send-email-will.newton@linaro.org |
---|---|
State | Accepted |
On 25 April 2014 13:06, Will Newton <will.newton@linaro.org> wrote:
> Add an optimized implementation of strcmp for ARMv7-A cores.  This
> implementation is significantly faster than the current generic C
> implementation, particularly for strings of 16 bytes and longer.
>
> Tested with the glibc string tests for arm-linux-gnueabihf and
> armeb-linux-gnueabihf.
>
> The code was written by ARM, who have agreed to assign the copyright
> to the FSF for integration into glibc.
>
> ChangeLog:
>
> 2014-04-25  Will Newton  <will.newton@linaro.org>
>
> 	* sysdeps/arm/armv7/strcmp.S: New file.
> 	* NEWS: Mention addition of ARMv7 optimized strcmp.
> ---
>  NEWS                       |   2 +
>  sysdeps/arm/armv7/strcmp.S | 491 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 493 insertions(+)
>  create mode 100644 sysdeps/arm/armv7/strcmp.S

Ping?

> Changes in v2:
>  - Add a NEWS entry
>  - Improve CFI annotations based on suggestions by Richard Henderson
>  - Rename STRCMP_NO_PRECHECK to STRCMP_PRECHECK
> [...]
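The "saturating arithmetic" mentioned in the commit message refers to the uadd8/sel sequences used throughout the patch to detect, in a single pass over each word, both a NUL byte and a byte difference.  A rough C model of that per-word "syndrome" computation (illustrative only; the function name and the byte-lane loop are mine, the real code does this in three instructions):

```c
#include <stdint.h>

/* Illustrative model of the uadd8/sel trick: uadd8 of data1 with
   0xffffffff sets the per-byte GE flag exactly for the non-zero bytes
   of data1, and sel then keeps data1 ^ data2 in those lanes while
   forcing 0xff into any lane whose data1 byte was NUL.  The word is
   therefore non-zero as soon as a difference or end-of-string is seen.  */
static uint32_t
strcmp_syndrome (uint32_t data1, uint32_t data2)
{
  uint32_t syndrome = data1 ^ data2;
  for (int lane = 0; lane < 4; lane++)
    if (((data1 >> (8 * lane)) & 0xff) == 0)
      syndrome |= 0xffu << (8 * lane);	/* sel picks const_m1 here.  */
  return syndrome;
}
```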
On Fri, 25 Apr 2014, Will Newton wrote:

> diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S
> new file mode 100644
> index 0000000..02a5c7c
> --- /dev/null
> +++ b/sysdeps/arm/armv7/strcmp.S
> @@ -0,0 +1,491 @@
> +/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.

The first line of any new file, before the copyright notice, should be a
descriptive comment.

> +#ifdef __ARM_BIG_ENDIAN
> +#define S2LO lsl
> +#define S2LOEQ lsleq
> +#define S2HI lsr
> +#define MSB 0x000000ff
> +#define LSB 0xff000000
> +#define BYTE0_OFFSET 24
> +#define BYTE1_OFFSET 16
> +#define BYTE2_OFFSET 8
> +#define BYTE3_OFFSET 0
> +#else /* not __ARM_BIG_ENDIAN */
> +#define S2LO lsr
> +#define S2LOEQ lsreq
> +#define S2HI lsl
> +#define BYTE0_OFFSET 0
> +#define BYTE1_OFFSET 8
> +#define BYTE2_OFFSET 16
> +#define BYTE3_OFFSET 24
> +#define MSB 0xff000000
> +#define LSB 0x000000ff
> +#endif /* not __ARM_BIG_ENDIAN */

Use "# define" indentation inside #if.

> +END(strcmp)

Space before '('.

OK with those changes.
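For readers unfamiliar with the convention being requested: glibc indents nested preprocessor directives with a space after the '#', one per level of nesting.  A minimal sketch of how the endianness block would look with that style applied (not the committed text):

```c
#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2HI lsr
#else
# define S2LO lsr
# define S2HI lsl
#endif
```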
On 8 May 2014 22:44, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Fri, 25 Apr 2014, Will Newton wrote:
>
>> diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S
>> new file mode 100644
>> index 0000000..02a5c7c
>> --- /dev/null
>> +++ b/sysdeps/arm/armv7/strcmp.S
>> @@ -0,0 +1,491 @@
>> +/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>
> The first line of any new file, before the copyright notice, should be a
> descriptive comment.
>
>> +#ifdef __ARM_BIG_ENDIAN
>> +#define S2LO lsl
>> +#define S2LOEQ lsleq
>> +#define S2HI lsr
>> +#define MSB 0x000000ff
>> +#define LSB 0xff000000
>> +#define BYTE0_OFFSET 24
>> +#define BYTE1_OFFSET 16
>> +#define BYTE2_OFFSET 8
>> +#define BYTE3_OFFSET 0
>> +#else /* not __ARM_BIG_ENDIAN */
>> +#define S2LO lsr
>> +#define S2LOEQ lsreq
>> +#define S2HI lsl
>> +#define BYTE0_OFFSET 0
>> +#define BYTE1_OFFSET 8
>> +#define BYTE2_OFFSET 16
>> +#define BYTE3_OFFSET 24
>> +#define MSB 0xff000000
>> +#define LSB 0x000000ff
>> +#endif /* not __ARM_BIG_ENDIAN */
>
> Use "# define" indentation inside #if.
>
>> +END(strcmp)
>
> Space before '('.
>
> OK with those changes.

Thanks. Committed with those changes.
diff --git a/NEWS b/NEWS
index a8a6ea8..2fe2bfc 100644
--- a/NEWS
+++ b/NEWS
@@ -34,6 +34,8 @@ Version 2.20
   interfaces those macros enabled remain available when compiling with
   _GNU_SOURCE defined, with _DEFAULT_SOURCE defined, or without any feature
   test macros defined.
+
+* Optimized strcmp implementation for ARMv7.  Contributed by ARM Ltd.
 
 Version 2.19
 
diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S
new file mode 100644
index 0000000..02a5c7c
--- /dev/null
+++ b/sysdeps/arm/armv7/strcmp.S
@@ -0,0 +1,491 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <arm-features.h>
+#include <sysdep.h>
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+   available.  Use ldrd to support wider loads, provided the data
+   is sufficiently aligned.  Use saturating arithmetic to optimize
+   the compares.  */
+
+/* Build Options:
+   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
+   string.  If comparing completely random strings the pre-check will
+   save time, since there is a very high probability of a mismatch in
+   the first character: we save significant overhead if this is the
+   common case.  However, if strings are likely to be identical (e.g.
+   because we're verifying a hit in a hash table), then this check
+   is largely redundant.  */
+
+#define STRCMP_PRECHECK	1
+
+	/* This version uses Thumb-2 code.  */
+	.thumb
+	.syntax unified
+
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not __ARM_BIG_ENDIAN */
+
+/* Parameters and result.  */
+#define src1		r0
+#define src2		r1
+#define result		r0	/* Overlaps src1.  */
+
+/* Internal variables.  */
+#define tmp1		r4
+#define tmp2		r5
+#define const_m1	r12
+
+/* Additional internal variables for 64-bit aligned data.  */
+#define data1a		r2
+#define data1b		r3
+#define data2a		r6
+#define data2b		r7
+#define syndrome_a	tmp1
+#define syndrome_b	tmp2
+
+/* Additional internal variables for 32-bit aligned data.  */
+#define data1		r2
+#define data2		r3
+#define syndrome	tmp2
+
+
+	/* Macro to compute and return the result value for word-aligned
+	   cases.  */
+	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+	/* If data1 contains a zero byte, then syndrome will contain a 1 in
+	   bit 7 of that byte.  Otherwise, the highest set bit in the
+	   syndrome will highlight the first different bit.  It is therefore
+	   sufficient to extract the eight bits starting with the syndrome
+	   bit.  */
+	clz	tmp1, \synd
+	lsl	r1, \d2, tmp1
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	lsl	\d1, \d1, tmp1
+	lsr	result, \d1, #24
+	ldrd	r4, r5, [sp], #16
+	cfi_remember_state
+	cfi_def_cfa_offset (0)
+	cfi_restore (r4)
+	cfi_restore (r5)
+	cfi_restore (r6)
+	cfi_restore (r7)
+	sub	result, result, r1, lsr #24
+	bx	lr
+#else
+	/* To use the big-endian trick we'd have to reverse all three words.
+	   that's slower than this approach.  */
+	rev	\synd, \synd
+	clz	tmp1, \synd
+	bic	tmp1, tmp1, #7
+	lsr	r1, \d2, tmp1
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	lsr	\d1, \d1, tmp1
+	and	result, \d1, #255
+	and	r1, r1, #255
+	ldrd	r4, r5, [sp], #16
+	cfi_remember_state
+	cfi_def_cfa_offset (0)
+	cfi_restore (r4)
+	cfi_restore (r5)
+	cfi_restore (r6)
+	cfi_restore (r7)
+	sub	result, result, r1
+
+	bx	lr
+#endif
+	.endm
+
+	.text
+	.p2align	5
+.Lstrcmp_start_addr:
+#if STRCMP_PRECHECK == 1
+.Lfastpath_exit:
+	sub	r0, r2, r3
+	bx	lr
+	nop
+#endif
+ENTRY(strcmp)
+#if STRCMP_PRECHECK == 1
+	ldrb	r2, [src1]
+	ldrb	r3, [src2]
+	cmp	r2, #1
+	it	cs
+	cmpcs	r2, r3
+	bne	.Lfastpath_exit
+#endif
+	strd	r4, r5, [sp, #-16]!
+	cfi_def_cfa_offset (16)
+	cfi_offset (r4, -16)
+	cfi_offset (r5, -12)
+	orr	tmp1, src1, src2
+	strd	r6, r7, [sp, #8]
+	cfi_offset (r6, -8)
+	cfi_offset (r7, -4)
+	mvn	const_m1, #0
+	lsl	r2, tmp1, #29
+	cbz	r2, .Lloop_aligned8
+
+.Lnot_aligned:
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	bne	.Lmisaligned8
+
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	and	tmp1, src1, #7
+	bic	src1, src1, #7
+	and	tmp2, tmp1, #3
+	bic	src2, src2, #7
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	ldrd	data1a, data1b, [src1], #16
+	tst	tmp1, #4
+	ldrd	data2a, data2b, [src2], #16
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp2
+	orn	data1a, data1a, tmp1
+	orn	data2a, data2a, tmp1
+	beq	.Lstart_realigned8
+	orn	data1b, data1b, tmp1
+	mov	data1a, const_m1
+	orn	data2b, data2b, tmp1
+	mov	data2a, const_m1
+	b	.Lstart_realigned8
+
+	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
+	   pass.  */
+	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
+	.p2align 2	/* Always word aligned.  */
+.Lloop_aligned8:
+	ldrd	data1a, data1b, [src1], #16
+	ldrd	data2a, data2b, [src2], #16
+.Lstart_realigned8:
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	cbnz	syndrome_a, .Ldiff_in_a
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	cbnz	syndrome_b, .Ldiff_in_b
+
+	ldrd	data1a, data1b, [src1, #-8]
+	ldrd	data2a, data2b, [src2, #-8]
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	/* Can't use CBZ for backwards branch.  */
+	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+	beq	.Lloop_aligned8
+
+.Ldiff_found:
+	cbnz	syndrome_a, .Ldiff_in_a
+
+.Ldiff_in_b:
+	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+	cfi_restore_state
+	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+	cfi_restore_state
+.Lmisaligned8:
+	tst	tmp1, #3
+	bne	.Lmisaligned4
+	ands	tmp1, src1, #3
+	bne	.Lmutual_align4
+
+	/* Unrolled by a factor of 2, to reduce the number of post-increment
+	   operations.  */
+.Lloop_aligned4:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned4:
+	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cbnz	syndrome, .Laligned4_done
+	ldr	data1, [src1, #-4]
+	ldr	data2, [src2, #-4]
+	uadd8	syndrome, data1, const_m1
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cmp	syndrome, #0
+	beq	.Lloop_aligned4
+
+.Laligned4_done:
+	strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+	cfi_restore_state
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
+	bic	src1, src1, #3
+	ldr	data1, [src1], #8
+	bic	src2, src2, #3
+	ldr	data2, [src2], #8
+
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp1
+	orn	data1, data1, tmp1
+	orn	data2, data2, tmp1
+	b	.Lstart_realigned4
+
+.Lmisaligned4:
+	ands	tmp1, src1, #3
+	beq	.Lsrc1_aligned
+	sub	src2, src2, tmp1
+	bic	src1, src1, #3
+	lsls	tmp1, tmp1, #31
+	ldr	data1, [src1], #4
+	beq	.Laligned_m2
+	bcs	.Laligned_m1
+
+#if STRCMP_PRECHECK == 0
+	ldrb	data2, [src2, #1]
+	uxtb	tmp1, data1, ror #BYTE1_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m1:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	add	src2, src2, #4
+	cbnz	data2, .Lsrc1_aligned
+#else /* STRCMP_PRECHECK */
+	/* If we've done the pre-check, then we don't need to check the
+	   first byte again here.  */
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbnz	data2, .Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+	mov	result, tmp1
+	ldr	r4, [sp], #16
+	cfi_remember_state
+	cfi_def_cfa_offset (0)
+	cfi_restore (r4)
+	cfi_restore (r5)
+	cfi_restore (r6)
+	cfi_restore (r7)
+	bx	lr
+
+#if STRCMP_PRECHECK == 1
+.Laligned_m1:
+	add	src2, src2, #4
+#endif
+.Lsrc1_aligned:
+	cfi_restore_state
+	/* src1 is word aligned, but src2 has no common alignment
+	   with it.  */
+	ldr	data1, [src1], #4
+	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
+
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
+	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */
+
+	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
+.Loverlap3:
+	bic	tmp1, data1, #MSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #8
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+	bics	syndrome, syndrome, #MSB
+	bne	.Lstrcmp_done_equal
+
+	/* We can only get here if the MSB of data1 contains 0, so
+	   fast-path the exit.  */
+	ldrb	result, [src2]
+	ldrd	r4, r5, [sp], #16
+	cfi_remember_state
+	cfi_def_cfa_offset (0)
+	cfi_restore (r4)
+	cfi_restore (r5)
+	/* R6/7 Not used in this sequence.  */
+	cfi_restore (r6)
+	cfi_restore (r7)
+	neg	result, result
+	bx	lr
+
+6:
+	cfi_restore_state
+	S2LO	data1, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap2:
+	and	tmp1, data1, const_m1, S2LO #16
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #16
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+5:
+	ands	syndrome, syndrome, const_m1, S2LO #16
+	bne	.Lstrcmp_done_equal
+
+	ldrh	data2, [src2]
+	S2LO	data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	data1, data1, #16
+	and	data2, data2, const_m1, S2LO #16
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap1:
+	and	tmp1, data1, #LSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #24
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	tst	syndrome, #LSB
+	bne	.Lstrcmp_done_equal
+	ldr	data2, [src2]
+6:
+	S2LO	data1, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+	mov	result, #0
+	ldrd	r4, r5, [sp], #16
+	cfi_remember_state
+	cfi_def_cfa_offset (0)
+	cfi_restore (r4)
+	cfi_restore (r5)
+	/* R6/7 not used in this sequence.  */
+	cfi_restore (r6)
+	cfi_restore (r7)
+	bx	lr
+
+.Lstrcmp_tail:
+	cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+	rev	data1, data1
+	rev	data2, data2
+	/* Now everything looks big-endian...  */
+#endif
+	uadd8	tmp1, data1, const_m1
+	eor	tmp1, data1, data2
+	sel	syndrome, tmp1, const_m1
+	clz	tmp1, syndrome
+	lsl	data1, data1, tmp1
+	lsl	data2, data2, tmp1
+	lsr	result, data1, #24
+	ldrd	r4, r5, [sp], #16
+	cfi_def_cfa_offset (0)
+	cfi_restore (r4)
+	cfi_restore (r5)
+	/* R6/7 not used in this sequence.  */
+	cfi_restore (r6)
+	cfi_restore (r7)
+	sub	result, result, data2, lsr #24
+	bx	lr
+END(strcmp)
+libc_hidden_builtin_def (strcmp)
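As a footnote to the epilogue logic in the patch, the little-endian branch of strcmp_epilogue_aligned can be modelled in C roughly as follows (a sketch, not part of the patch; __builtin_bswap32 and __builtin_clz stand in for the rev and clz instructions, and a non-zero syndrome is assumed, as in the assembly):

```c
#include <stdint.h>

/* Byte-reverse the syndrome so the first mismatching or NUL byte
   becomes the most significant interesting bits, round the leading-zero
   count down to a byte boundary (clz then bic #7), and subtract the
   selected bytes of the two data words.  */
static int
strcmp_result_le (uint32_t syndrome, uint32_t data1, uint32_t data2)
{
  syndrome = __builtin_bswap32 (syndrome);		/* rev \synd  */
  unsigned int shift = __builtin_clz (syndrome) & ~7u;	/* clz; bic #7  */
  int b1 = (data1 >> shift) & 0xff;			/* lsr; and #255  */
  int b2 = (data2 >> shift) & 0xff;
  return b1 - b2;
}
```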