From patchwork Wed Apr 20 07:43:17 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jim Huang X-Patchwork-Id: 1117 Return-Path: Delivered-To: unknown Received: from imap.gmail.com (74.125.159.109) by localhost6.localdomain6 with IMAP4-SSL; 08 Jun 2011 14:49:17 -0000 Delivered-To: patches@linaro.org Received: by 10.224.67.148 with SMTP id r20cs104185qai; Wed, 20 Apr 2011 00:43:18 -0700 (PDT) Received: by 10.216.140.92 with SMTP id d70mr2192633wej.105.1303285398156; Wed, 20 Apr 2011 00:43:18 -0700 (PDT) Received: from mail-ww0-f42.google.com (mail-ww0-f42.google.com [74.125.82.42]) by mx.google.com with ESMTPS id m28si1399629wej.142.2011.04.20.00.43.17 (version=TLSv1/SSLv3 cipher=OTHER); Wed, 20 Apr 2011 00:43:18 -0700 (PDT) Received-SPF: neutral (google.com: 74.125.82.42 is neither permitted nor denied by best guess record for domain of jim.huang@linaro.org) client-ip=74.125.82.42; Authentication-Results: mx.google.com; spf=neutral (google.com: 74.125.82.42 is neither permitted nor denied by best guess record for domain of jim.huang@linaro.org) smtp.mail=jim.huang@linaro.org Received: by wwk4 with SMTP id 4so3771376wwk.1 for ; Wed, 20 Apr 2011 00:43:17 -0700 (PDT) MIME-Version: 1.0 Received: by 10.216.152.170 with SMTP id d42mr2502716wek.39.1303285397580; Wed, 20 Apr 2011 00:43:17 -0700 (PDT) Received: by 10.216.8.193 with HTTP; Wed, 20 Apr 2011 00:43:17 -0700 (PDT) Date: Wed, 20 Apr 2011 15:43:17 +0800 Message-ID: Subject: [PATCH android/bionic] Add ARM optimized strcmp() From: Jim Huang To: linaro-dev Cc: patches Reference results of the experiments on TI OMAP3430 at 600 MHz $ bench_strcmp -N "strcmp_1k" -s 1k -I 200 [original C code] prc thr usecs/call samples errors cnt/samp size strcmp_1k 1 1 10.38000 102 0 15000 1024 [ARM optimized code] prc thr usecs/call samples errors cnt/samp size strcmp_1k 1 1 3.08840 88 0 15000 1024 The work was derived from ARM Ltd, contributed to newlib, and reworked for Android by Linaro. Code Review: https://review.source.android.com/#change,22419 >From f50e9be5930a08fa825b0c23353c802e11369b14 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 20 Apr 2011 15:35:04 +0800 Subject: [PATCH] bionic: Add ARM optimized strcmp() Reference results of the experiments on TI OMAP3430 at 600 MHz $ bench_strcmp -N "strcmp_1k" -s 1k -I 200 [original C code] prc thr usecs/call samples errors cnt/samp size strcmp_1k 1 1 10.38000 102 0 15000 1024 [ARM optimized code] prc thr usecs/call samples errors cnt/samp size strcmp_1k 1 1 3.08840 88 0 15000 1024 The work was derived from ARM Ltd, contributed to newlib, and reworked for Android by Linaro. Change-Id: Ib0d5755e1eb9adb07d80ef0252f57a5c4c57a425 Signed-off-by: Jim Huang --- libc/Android.mk | 2 +- libc/arch-arm/bionic/strcmp.S | 321 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 322 insertions(+), 1 deletions(-) create mode 100644 libc/arch-arm/bionic/strcmp.S diff --git a/libc/Android.mk b/libc/Android.mk index d940753..2e7cd5a 100644 --- a/libc/Android.mk +++ b/libc/Android.mk @@ -360,10 +360,10 @@ libc_common_src_files += \ arch-arm/bionic/sigsetjmp.S \ arch-arm/bionic/strlen.c.arm \ arch-arm/bionic/strcpy.S \ + arch-arm/bionic/strcmp.S \ arch-arm/bionic/syscall.S \ string/memmove.c.arm \ string/bcopy.c \ - string/strcmp.c \ string/strncmp.c \ unistd/socketcalls.c diff --git a/libc/arch-arm/bionic/strcmp.S b/libc/arch-arm/bionic/strcmp.S new file mode 100644 index 0000000..9fdbd56 --- /dev/null +++ b/libc/arch-arm/bionic/strcmp.S @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2011 The Android Open Source Project + * Copyright (c) 2008 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + + .text + + .global strcmp + .type strcmp, %function + .align 4 + +#ifdef __ARMEB__ +#define SHFT2LSB lsl +#define SHFT2LSBEQ lsleq +#define SHFT2MSB lsr +#define SHFT2MSBEQ lsreq +#define MSB 0x000000ff +#define LSB 0xff000000 +#else +#define SHFT2LSB lsr +#define SHFT2LSBEQ lsreq +#define SHFT2MSB lsl +#define SHFT2MSBEQ lsleq +#define MSB 0xff000000 +#define LSB 0x000000ff +#endif + +#define magic1(REG) REG +#define magic2(REG) REG, lsl #7 + +strcmp: + .fnstart + PLD(r0, #0) + PLD(r1, #0) + eor r2, r0, r1 + tst r2, #3 + + /* Strings not at same byte offset from a word boundary. */ + bne .Lstrcmp_unaligned + ands r2, r0, #3 + bic r0, r0, #3 + bic r1, r1, #3 + ldr ip, [r0], #4 + it eq + ldreq r3, [r1], #4 + beq 1f + + /* Although s1 and s2 have identical initial alignment, they are + * not currently word aligned. Rather than comparing bytes, + * make sure that any bytes fetched from before the addressed + * bytes are forced to 0xff. Then they will always compare + * equal. + */ + eor r2, r2, #3 + lsl r2, r2, #3 + mvn r3, #MSB + SHFT2LSB r2, r3, r2 + ldr r3, [r1], #4 + orr ip, ip, r2 + orr r3, r3, r2 +1: + /* Load the 'magic' constant 0x01010101. */ + str r4, [sp, #-4]! + mov r4, #1 + orr r4, r4, r4, lsl #8 + orr r4, r4, r4, lsl #16 + .p2align 2 +4: + PLD(r0, #8) + PLD(r1, #8) + sub r2, ip, magic1(r4) + cmp ip, r3 + itttt eq + + /* check for any zero bytes in first word */ + biceq r2, r2, ip + tsteq r2, magic2(r4) + ldreq ip, [r0], #4 + ldreq r3, [r1], #4 + beq 4b +2: + /* There's a zero or a different byte in the word */ + SHFT2MSB r0, ip, #24 + SHFT2LSB ip, ip, #8 + cmp r0, #1 + it cs + cmpcs r0, r3, SHFT2MSB #24 + it eq + SHFT2LSBEQ r3, r3, #8 + beq 2b + /* On a big-endian machine, r0 contains the desired byte in bits + * 0-7; on a little-endian machine they are in bits 24-31. In + * both cases the other bits in r0 are all zero. For r3 the + * interesting byte is at the other end of the word, but the + * other bits are not necessarily zero. We need a signed result + * representing the differnece in the unsigned bytes, so for the + * little-endian case we can't just shift the interesting bits up. + */ +#ifdef __ARMEB__ + sub r0, r0, r3, lsr #24 +#else + and r3, r3, #255 + /* No RSB instruction in Thumb2 */ +#ifdef __thumb2__ + lsr r0, r0, #24 + sub r0, r0, r3 +#else + rsb r0, r3, r0, lsr #24 +#endif +#endif + ldr r4, [sp], #4 + bx lr + .fnend + +.Lstrcmp_unaligned: + wp1 .req r0 + wp2 .req r1 + b1 .req r2 + w1 .req r4 + w2 .req r5 + t1 .req ip + @ r3 is scratch + + /* First of all, compare bytes until wp1(sp1) is word-aligned. */ +1: + tst wp1, #3 + beq 2f + ldrb r2, [wp1], #1 + ldrb r3, [wp2], #1 + cmp r2, #1 + it cs + cmpcs r2, r3 + beq 1b + sub r0, r2, r3 + bx lr + +2: + str r5, [sp, #-4]! + str r4, [sp, #-4]! + mov b1, #1 + orr b1, b1, b1, lsl #8 + orr b1, b1, b1, lsl #16 + + and t1, wp2, #3 + bic wp2, wp2, #3 + ldr w1, [wp1], #4 + ldr w2, [wp2], #4 + cmp t1, #2 + beq 2f + bhi 3f + + /* Critical inner Loop: Block with 3 bytes initial overlap */ + .p2align 2 +1: + bic t1, w1, #MSB + cmp t1, w2, SHFT2LSB #8 + sub r3, w1, b1 + bic r3, r3, w1 + bne 4f + ands r3, r3, b1, lsl #7 + it eq + ldreq w2, [wp2], #4 + bne 5f + eor t1, t1, w1 + cmp t1, w2, SHFT2MSB #24 + bne 6f + ldr w1, [wp1], #4 + b 1b +4: + SHFT2LSB w2, w2, #8 + b 8f + +5: +#ifdef __ARMEB__ + /* The syndrome value may contain false ones if the string ends + * with the bytes 0x01 0x00 + */ + tst w1, #0xff000000 + itt ne + tstne w1, #0x00ff0000 + tstne w1, #0x0000ff00 + beq 7f +#else + bics r3, r3, #0xff000000 + bne 7f +#endif + ldrb w2, [wp2] + SHFT2LSB t1, w1, #24 +#ifdef __ARMEB__ + lsl w2, w2, #24 +#endif + b 8f + +6: + SHFT2LSB t1, w1, #24 + and w2, w2, #LSB + b 8f + + /* Critical inner Loop: Block with 2 bytes initial overlap */ + .p2align 2 +2: + SHFT2MSB t1, w1, #16 + sub r3, w1, b1 + SHFT2LSB t1, t1, #16 + bic r3, r3, w1 + cmp t1, w2, SHFT2LSB #16 + bne 4f + ands r3, r3, b1, lsl #7 + it eq + ldreq w2, [wp2], #4 + bne 5f + eor t1, t1, w1 + cmp t1, w2, SHFT2MSB #16 + bne 6f + ldr w1, [wp1], #4 + b 2b + +5: +#ifdef __ARMEB__ + /* The syndrome value may contain false ones if the string ends + * with the bytes 0x01 0x00 + */ + tst w1, #0xff000000 + it ne + tstne w1, #0x00ff0000 + beq 7f +#else + lsls r3, r3, #16 + bne 7f +#endif + ldrh w2, [wp2] + SHFT2LSB t1, w1, #16 +#ifdef __ARMEB__ + lsl w2, w2, #16 +#endif + b 8f + +6: + SHFT2MSB w2, w2, #16 + SHFT2LSB t1, w1, #16 +4: + SHFT2LSB w2, w2, #16 + b 8f + + /* Critical inner Loop: Block with 1 byte initial overlap */ + .p2align 2 +3: + and t1, w1, #LSB + cmp t1, w2, SHFT2LSB #24 + sub r3, w1, b1 + bic r3, r3, w1 + bne 4f + ands r3, r3, b1, lsl #7 + it eq + ldreq w2, [wp2], #4 + bne 5f + eor t1, t1, w1 + cmp t1, w2, SHFT2MSB #8 + bne 6f + ldr w1, [wp1], #4 + b 3b +4: + SHFT2LSB w2, w2, #24 + b 8f +5: + /* The syndrome value may contain false ones if the string ends + * with the bytes 0x01 0x00 + */ + tst w1, #LSB + beq 7f + ldr w2, [wp2], #4 +6: + SHFT2LSB t1, w1, #8 + bic w2, w2, #MSB + b 8f +7: + mov r0, #0 + ldr r4, [sp], #4 + ldr r5, [sp], #4 + bx lr + +8: + and r2, t1, #LSB + and r0, w2, #LSB + cmp r0, #1 + it cs + cmpcs r0, r2 + itt eq + SHFT2LSBEQ t1, t1, #8 + SHFT2LSBEQ w2, w2, #8 + beq 8b + sub r0, r2, r0 + ldr r4, [sp], #4 + ldr r5, [sp], #4 + bx lr -- 1.7.4.1