diff mbox

[android/bionic] Add ARM optimized strcmp()

Message ID BANLkTi=7YX3c=hiYw3GfHES8C=Q557MZoQ@mail.gmail.com
State New
Headers show

Commit Message

Jim Huang April 20, 2011, 7:43 a.m. UTC
Reference results of the experiments on TI OMAP3430 at 600 MHz

$ bench_strcmp -N "strcmp_1k" -s 1k -I 200

[original C code]
prc thr usecs/call samples errors cnt/samp size
strcmp_1k 1 1 10.38000 102 0 15000 1024

[ARM optimized code]
prc thr usecs/call samples errors cnt/samp size
strcmp_1k 1 1 3.08840 88 0 15000 1024

The work was derived from ARM Ltd, contributed to newlib, and reworked
for Android by Linaro.

Code Review:
    https://review.source.android.com/#change,22419

Comments

Jim Huang May 17, 2011, 11:36 a.m. UTC | #1
On 20 April 2011 15:43, Jim Huang <jim.huang@linaro.org> wrote:
> Reference results of the experiments on TI OMAP3430 at 600 MHz
[...]
> Code Review:
>    https://review.source.android.com/#change,22419

Merged in AOSP:
    http://android.git.kernel.org/?p=platform/bionic.git;a=commitdiff;h=f50e9be5930a08fa825b0c23353c802e11369b14
diff mbox

Patch

From f50e9be5930a08fa825b0c23353c802e11369b14 Mon Sep 17 00:00:00 2001
From: Jim Huang <jim.huang@linaro.org>
Date: Wed, 20 Apr 2011 15:35:04 +0800
Subject: [PATCH] bionic: Add ARM optimized strcmp()

Reference results of the experiments on TI OMAP3430 at 600 MHz

$ bench_strcmp -N "strcmp_1k" -s 1k -I 200

[original C code]
             prc thr   usecs/call      samples   errors cnt/samp     size
strcmp_1k      1   1     10.38000          102        0    15000     1024

[ARM optimized code]
             prc thr   usecs/call      samples   errors cnt/samp     size
strcmp_1k      1   1      3.08840           88        0    15000     1024

The work was derived from ARM Ltd, contributed to newlib, and reworked
for Android by Linaro.

Change-Id: Ib0d5755e1eb9adb07d80ef0252f57a5c4c57a425
Signed-off-by: Jim Huang <jserv@0xlab.org>
---
 libc/Android.mk               |    2 +-
 libc/arch-arm/bionic/strcmp.S |  321 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 322 insertions(+), 1 deletions(-)
 create mode 100644 libc/arch-arm/bionic/strcmp.S

diff --git a/libc/Android.mk b/libc/Android.mk
index d940753..2e7cd5a 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -360,10 +360,10 @@  libc_common_src_files += \
 	arch-arm/bionic/sigsetjmp.S \
 	arch-arm/bionic/strlen.c.arm \
 	arch-arm/bionic/strcpy.S \
+	arch-arm/bionic/strcmp.S \
 	arch-arm/bionic/syscall.S \
 	string/memmove.c.arm \
 	string/bcopy.c \
-	string/strcmp.c \
 	string/strncmp.c \
 	unistd/socketcalls.c
 
diff --git a/libc/arch-arm/bionic/strcmp.S b/libc/arch-arm/bionic/strcmp.S
new file mode 100644
index 0000000..9fdbd56
--- /dev/null
+++ b/libc/arch-arm/bionic/strcmp.S
@@ -0,0 +1,321 @@ 
+/*
+ * Copyright (c) 2011 The Android Open Source Project
+ * Copyright (c) 2008 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+
+	.text
+
+	.global strcmp
+	.type strcmp, %function
+	.align 4
+
+#ifdef __ARMEB__
+#define SHFT2LSB lsl
+#define SHFT2LSBEQ lsleq
+#define SHFT2MSB lsr
+#define SHFT2MSBEQ lsreq
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#else
+#define SHFT2LSB lsr
+#define SHFT2LSBEQ lsreq
+#define SHFT2MSB lsl
+#define SHFT2MSBEQ lsleq
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif
+
+#define magic1(REG) REG
+#define magic2(REG) REG, lsl #7
+
+strcmp:
+	.fnstart
+	PLD(r0, #0)
+	PLD(r1, #0)
+	eor	r2, r0, r1
+	tst	r2, #3
+
+	/* Strings not at same byte offset from a word boundary.  */
+	bne	.Lstrcmp_unaligned
+	ands	r2, r0, #3
+	bic	r0, r0, #3
+	bic	r1, r1, #3
+	ldr	ip, [r0], #4
+	it	eq
+	ldreq	r3, [r1], #4
+	beq	1f
+
+	/* Although s1 and s2 have identical initial alignment, they are
+	 * not currently word aligned.  Rather than comparing bytes,
+	 * make sure that any bytes fetched from before the addressed
+	 * bytes are forced to 0xff.  Then they will always compare
+	 * equal.
+	 */
+	eor	r2, r2, #3
+	lsl	r2, r2, #3
+	mvn	r3, #MSB
+	SHFT2LSB	r2, r3, r2
+	ldr	r3, [r1], #4
+	orr	ip, ip, r2
+	orr	r3, r3, r2
+1:
+	/* Load the 'magic' constant 0x01010101. */
+	str	r4, [sp, #-4]!
+	mov	r4, #1
+	orr	r4, r4, r4, lsl #8
+	orr	r4, r4, r4, lsl #16
+	.p2align	2
+4:
+	PLD(r0, #8)
+	PLD(r1, #8)
+	sub	r2, ip, magic1(r4)
+	cmp	ip, r3
+	itttt	eq
+
+	/* check for any zero bytes in first word */
+	biceq	r2, r2, ip
+	tsteq	r2, magic2(r4)
+	ldreq	ip, [r0], #4
+	ldreq	r3, [r1], #4
+	beq	4b
+2:
+	/* There's a zero or a different byte in the word */
+	SHFT2MSB	r0, ip, #24
+	SHFT2LSB	ip, ip, #8
+	cmp	r0, #1
+	it	cs
+	cmpcs	r0, r3, SHFT2MSB #24
+	it	eq
+	SHFT2LSBEQ r3, r3, #8
+	beq	2b
+	/* On a big-endian machine, r0 contains the desired byte in bits
+	 * 0-7; on a little-endian machine they are in bits 24-31.  In
+	 * both cases the other bits in r0 are all zero.  For r3 the
+	 * interesting byte is at the other end of the word, but the
+	 * other bits are not necessarily zero.  We need a signed result
+	 * representing the differnece in the unsigned bytes, so for the
+	 * little-endian case we can't just shift the interesting bits up.
+	 */
+#ifdef __ARMEB__
+	sub	r0, r0, r3, lsr #24
+#else
+	and	r3, r3, #255
+	/* No RSB instruction in Thumb2 */
+#ifdef __thumb2__
+	lsr	r0, r0, #24
+	sub	r0, r0, r3
+#else
+	rsb	r0, r3, r0, lsr #24
+#endif
+#endif
+	ldr	r4, [sp], #4
+	bx	lr
+	.fnend
+
+.Lstrcmp_unaligned:
+	wp1 .req r0
+	wp2 .req r1
+	b1  .req r2
+	w1  .req r4
+	w2  .req r5
+	t1  .req ip
+	@ r3 is scratch
+
+	/* First of all, compare bytes until wp1(sp1) is word-aligned. */
+1:
+	tst	wp1, #3
+	beq	2f
+	ldrb	r2, [wp1], #1
+	ldrb	r3, [wp2], #1
+	cmp	r2, #1
+	it	cs
+	cmpcs	r2, r3
+	beq	1b
+	sub	r0, r2, r3
+	bx	lr
+
+2:
+	str	r5, [sp, #-4]!
+	str	r4, [sp, #-4]!
+	mov	b1, #1
+	orr	b1, b1, b1, lsl #8
+	orr	b1, b1, b1, lsl #16
+
+	and	t1, wp2, #3
+	bic	wp2, wp2, #3
+	ldr	w1, [wp1], #4
+	ldr	w2, [wp2], #4
+	cmp	t1, #2
+	beq	2f
+	bhi	3f
+
+	/* Critical inner Loop: Block with 3 bytes initial overlap */
+	.p2align	2
+1:
+	bic	t1, w1, #MSB
+	cmp	t1, w2, SHFT2LSB #8
+	sub	r3, w1, b1
+	bic	r3, r3, w1
+	bne	4f
+	ands	r3, r3, b1, lsl #7
+	it	eq
+	ldreq	w2, [wp2], #4
+	bne	5f
+	eor	t1, t1, w1
+	cmp	t1, w2, SHFT2MSB #24
+	bne	6f
+	ldr	w1, [wp1], #4
+	b	1b
+4:
+	SHFT2LSB	w2, w2, #8
+	b	8f
+
+5:
+#ifdef __ARMEB__
+	/* The syndrome value may contain false ones if the string ends
+	 * with the bytes 0x01 0x00
+	 */
+	tst	w1, #0xff000000
+	itt	ne
+	tstne	w1, #0x00ff0000
+	tstne	w1, #0x0000ff00
+	beq	7f
+#else
+	bics	r3, r3, #0xff000000
+	bne	7f
+#endif
+	ldrb	w2, [wp2]
+	SHFT2LSB	t1, w1, #24
+#ifdef __ARMEB__
+	lsl	w2, w2, #24
+#endif
+	b	8f
+
+6:
+	SHFT2LSB	t1, w1, #24
+	and	w2, w2, #LSB
+	b	8f
+
+	/* Critical inner Loop: Block with 2 bytes initial overlap */
+	.p2align	2
+2:
+	SHFT2MSB	t1, w1, #16
+	sub	r3, w1, b1
+	SHFT2LSB	t1, t1, #16
+	bic	r3, r3, w1
+	cmp	t1, w2, SHFT2LSB #16
+	bne	4f
+	ands	r3, r3, b1, lsl #7
+	it	eq
+	ldreq	w2, [wp2], #4
+	bne	5f
+	eor	t1, t1, w1
+	cmp	t1, w2, SHFT2MSB #16
+	bne	6f
+	ldr	w1, [wp1], #4
+	b	2b
+
+5:
+#ifdef __ARMEB__
+	/* The syndrome value may contain false ones if the string ends
+	 * with the bytes 0x01 0x00
+	 */
+	tst	w1, #0xff000000
+	it	ne
+	tstne	w1, #0x00ff0000
+	beq	7f
+#else
+	lsls	r3, r3, #16
+	bne	7f
+#endif
+	ldrh	w2, [wp2]
+	SHFT2LSB	t1, w1, #16
+#ifdef __ARMEB__
+	lsl	w2, w2, #16
+#endif
+	b	8f
+
+6:
+	SHFT2MSB	w2, w2, #16
+	SHFT2LSB	t1, w1, #16
+4:
+	SHFT2LSB	w2, w2, #16
+	b	8f
+
+	/* Critical inner Loop: Block with 1 byte initial overlap */
+	.p2align	2
+3:
+	and	t1, w1, #LSB
+	cmp	t1, w2, SHFT2LSB #24
+	sub	r3, w1, b1
+	bic	r3, r3, w1
+	bne	4f
+	ands	r3, r3, b1, lsl #7
+	it	eq
+	ldreq	w2, [wp2], #4
+	bne	5f
+	eor	t1, t1, w1
+	cmp	t1, w2, SHFT2MSB #8
+	bne	6f
+	ldr	w1, [wp1], #4
+	b	3b
+4:
+	SHFT2LSB	w2, w2, #24
+	b	8f
+5:
+	/* The syndrome value may contain false ones if the string ends
+	 * with the bytes 0x01 0x00
+	 */
+	tst	w1, #LSB
+	beq	7f
+	ldr	w2, [wp2], #4
+6:
+	SHFT2LSB	t1, w1, #8
+	bic	w2, w2, #MSB
+	b	8f
+7:
+	mov	r0, #0
+	ldr	r4, [sp], #4
+	ldr	r5, [sp], #4
+	bx	lr
+
+8:
+	and	r2, t1, #LSB
+	and	r0, w2, #LSB
+	cmp	r0, #1
+	it	cs
+	cmpcs	r0, r2
+	itt	eq
+	SHFT2LSBEQ	t1, t1, #8
+	SHFT2LSBEQ	w2, w2, #8
+	beq	8b
+	sub	r0, r2, r0
+	ldr	r4, [sp], #4
+	ldr	r5, [sp], #4
+	bx	lr
-- 
1.7.4.1