From patchwork Fri Jul 25 15:43:20 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ian Campbell X-Patchwork-Id: 34293 Return-Path: X-Original-To: linaro@patches.linaro.org Delivered-To: linaro@patches.linaro.org Received: from mail-ie0-f197.google.com (mail-ie0-f197.google.com [209.85.223.197]) by ip-10-151-82-157.ec2.internal (Postfix) with ESMTPS id 11CF6235AB for ; Fri, 25 Jul 2014 15:45:31 +0000 (UTC) Received: by mail-ie0-f197.google.com with SMTP id rp18sf28003390iec.4 for ; Fri, 25 Jul 2014 08:45:30 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20130820; h=x-gm-message-state:delivered-to:message-id:from:to:date:in-reply-to :references:organization:mime-version:cc:subject:precedence:list-id :list-unsubscribe:list-post:list-help:list-subscribe:sender :errors-to:x-original-sender:x-original-authentication-results :mailing-list:list-archive:content-type:content-transfer-encoding; bh=7sBVc/9KpTGmkK/QlqFiUg6d+rTEbKmw3O4pojUH2rU=; b=QWlC4gxVlTgxeHzJj2rmUKiKzU1bFB4LuYV09D0GX9hFhfzK3J5hhPzNslyHn8/Pdp w5lHlRB4DAJFWZ2fv16EzdwekRmExMHJLr0RHuyv/Q15tHZKJ3MmWbf+zxdDT87ziJUw 1M2yYmbTJN+hF5kCUnWwStdr0JYGy5qD+Ij1mJFIsuPuy4p4FKj3lT0D19tUgXABsKAc 7FRk2wQ3UuKFC9XKTxGMO55BhAn3ukLohQtGp1s4dThP61x3y5RKXuJvBTrb1MdeFNcP sZOAN+rvgc/P+pb1KTPcDhs3nEE/PZmi+OvN3a3MfyZOLGK9UnIXONS0lf+ekA8SE59q DmGA== X-Gm-Message-State: ALoCoQkODSgfKjpbsDeoqss85VTZUB4NpgcQgKbW6NAIjZvfP1NtW772WGhM77b3mrilpONGIi3A X-Received: by 10.182.91.108 with SMTP id cd12mr8634260obb.13.1406303130603; Fri, 25 Jul 2014 08:45:30 -0700 (PDT) X-BeenThere: patchwork-forward@linaro.org Received: by 10.140.39.39 with SMTP id u36ls508467qgu.63.gmail; Fri, 25 Jul 2014 08:45:30 -0700 (PDT) X-Received: by 10.52.138.7 with SMTP id qm7mr18473482vdb.7.1406303130408; Fri, 25 Jul 2014 08:45:30 -0700 (PDT) Received: from mail-vc0-f171.google.com (mail-vc0-f171.google.com [209.85.220.171]) by mx.google.com with ESMTPS id h4si7731467vdq.43.2014.07.25.08.45.30 for (version=TLSv1 cipher=ECDHE-RSA-RC4-SHA bits=128/128); Fri, 25 Jul 2014 08:45:30 -0700 (PDT) Received-SPF: pass (google.com: domain of patch+caf_=patchwork-forward=linaro.org@linaro.org designates 209.85.220.171 as permitted sender) client-ip=209.85.220.171; Received: by mail-vc0-f171.google.com with SMTP id hq11so7633643vcb.30 for ; Fri, 25 Jul 2014 08:45:30 -0700 (PDT) X-Received: by 10.52.129.200 with SMTP id ny8mr18361152vdb.27.1406303130136; Fri, 25 Jul 2014 08:45:30 -0700 (PDT) X-Forwarded-To: patchwork-forward@linaro.org X-Forwarded-For: patch@linaro.org patchwork-forward@linaro.org Delivered-To: patch@linaro.org Received: by 10.221.37.5 with SMTP id tc5csp47633vcb; Fri, 25 Jul 2014 08:45:29 -0700 (PDT) X-Received: by 10.50.25.162 with SMTP id d2mr6926588igg.0.1406303129305; Fri, 25 Jul 2014 08:45:29 -0700 (PDT) Received: from lists.xen.org (lists.xen.org. [50.57.142.19]) by mx.google.com with ESMTPS id gb4si22462175icb.75.2014.07.25.08.45.28 for (version=TLSv1 cipher=RC4-SHA bits=128/128); Fri, 25 Jul 2014 08:45:29 -0700 (PDT) Received-SPF: none (google.com: xen-devel-bounces@lists.xen.org does not designate permitted sender hosts) client-ip=50.57.142.19; Received: from localhost ([127.0.0.1] helo=lists.xen.org) by lists.xen.org with esmtp (Exim 4.72) (envelope-from ) id 1XAheY-0005ko-OV; Fri, 25 Jul 2014 15:43:26 +0000 Received: from mail6.bemta4.messagelabs.com ([85.158.143.247]) by lists.xen.org with esmtp (Exim 4.72) (envelope-from ) id 1XAheX-0005kZ-IS for xen-devel@lists.xen.org; Fri, 25 Jul 2014 15:43:25 +0000 Received: from [85.158.143.35:58048] by server-2.bemta-4.messagelabs.com id 6A/67-04525-C1B72D35; Fri, 25 Jul 2014 15:43:24 +0000 X-Env-Sender: Ian.Campbell@citrix.com X-Msg-Ref: server-9.tower-21.messagelabs.com!1406303002!20256490!1 X-Originating-IP: [66.165.176.89] X-SpamReason: No, hits=0.0 required=7.0 tests=sa_preprocessor: VHJ1c3RlZCBJUDogNjYuMTY1LjE3Ni44OSA9PiAyMDMwMDc=\n X-StarScan-Received: X-StarScan-Version: 6.11.3; banners=-,-,- X-VirusChecked: Checked Received: (qmail 25272 invoked from network); 25 Jul 2014 15:43:23 -0000 Received: from smtp.citrix.com (HELO SMTP.CITRIX.COM) (66.165.176.89) by server-9.tower-21.messagelabs.com with RC4-SHA encrypted SMTP; 25 Jul 2014 15:43:23 -0000 X-IronPort-AV: E=Sophos;i="5.01,731,1400025600"; d="scan'208";a="155718002" Received: from accessns.citrite.net (HELO FTLPEX01CL03.citrite.net) ([10.9.154.239]) by FTLPIPO01.CITRIX.COM with ESMTP; 25 Jul 2014 15:43:23 +0000 Received: from kazak.uk.xensource.com (10.80.2.80) by FTLPEX01CL03.citrite.net (10.13.107.80) with Microsoft SMTP Server id 14.3.181.6; Fri, 25 Jul 2014 11:43:21 -0400 Message-ID: <1406303000.24842.37.camel@kazak.uk.xensource.com> From: Ian Campbell To: Date: Fri, 25 Jul 2014 16:43:20 +0100 In-Reply-To: <80a33cc325055bc9d63e4ef272c5b7f68f8fa812.1406301772.git.ian.campbell@citrix.com> References: <80a33cc325055bc9d63e4ef272c5b7f68f8fa812.1406301772.git.ian.campbell@citrix.com> Organization: Citrix Systems, Inc. X-Mailer: Evolution 3.12.2-1 MIME-Version: 1.0 X-Originating-IP: [10.80.2.80] X-DLP: MIA1 Cc: julien.grall@linaro.org, tim@xen.org, stefano.stabellini@eu.citrix.com Subject: Re: [Xen-devel] [PATCH 1/2] xen: arm: update arm64 assembly primitives to Linux v3.16-rc6 X-BeenThere: xen-devel@lists.xen.org X-Mailman-Version: 2.1.13 Precedence: list List-Id: List-Unsubscribe: , List-Post: , List-Help: , List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org X-Removed-Original-Auth: Dkim didn't pass. X-Original-Sender: ian.campbell@citrix.com X-Original-Authentication-Results: mx.google.com; spf=pass (google.com: domain of patch+caf_=patchwork-forward=linaro.org@linaro.org designates 209.85.220.171 as permitted sender) smtp.mail=patch+caf_=patchwork-forward=linaro.org@linaro.org Mailing-list: list patchwork-forward@linaro.org; contact patchwork-forward+owners@linaro.org X-Google-Group-Id: 836684582541 List-Archive: On Fri, 2014-07-25 at 16:22 +0100, Ian Campbell wrote: > str*: No changes. Record new baseline. I missed that there were some new primitives (str[n]len and str[n]cmp). Rather than respin this big patch here is a followup: 8<------------------- >From 66c115115122ca21035d55f486ea2eed1e284dd7 Mon Sep 17 00:00:00 2001 Message-Id: <66c115115122ca21035d55f486ea2eed1e284dd7.1406302952.git.ian.campbell@citrix.com> From: Ian Campbell Date: Fri, 25 Jul 2014 16:31:46 +0100 Subject: [PATCH] xen: arm: Add new str* primitives from Linux v3.16-rc6. Imports: 0a42cb0 arm64: lib: Implement optimized string length routines Author: zhichang.yuan Signed-off-by: Zhichang Yuan Signed-off-by: Deepak Saxena Signed-off-by: Catalin Marinas 192c4d9 arm64: lib: Implement optimized string compare routines Author: zhichang.yuan Signed-off-by: Zhichang Yuan Signed-off-by: Deepak Saxena Signed-off-by: Catalin Marinas Signed-off-by: Ian Campbell --- xen/arch/arm/README.LinuxPrimitives | 10 +- xen/arch/arm/arm64/lib/Makefile | 2 +- xen/arch/arm/arm64/lib/strcmp.S | 235 ++++++++++++++++++++++++++ xen/arch/arm/arm64/lib/strlen.S | 128 ++++++++++++++ xen/arch/arm/arm64/lib/strncmp.S | 311 +++++++++++++++++++++++++++++++++++ xen/arch/arm/arm64/lib/strnlen.S | 172 +++++++++++++++++++ xen/include/asm-arm/string.h | 14 ++ 7 files changed, 870 insertions(+), 2 deletions(-) create mode 100644 xen/arch/arm/arm64/lib/strcmp.S create mode 100644 xen/arch/arm/arm64/lib/strlen.S create mode 100644 xen/arch/arm/arm64/lib/strncmp.S create mode 100644 xen/arch/arm/arm64/lib/strnlen.S diff --git a/xen/arch/arm/README.LinuxPrimitives b/xen/arch/arm/README.LinuxPrimitives index 7e15b04..7f33fc7 100644 --- a/xen/arch/arm/README.LinuxPrimitives +++ b/xen/arch/arm/README.LinuxPrimitives @@ -49,11 +49,19 @@ done --------------------------------------------------------------------- -str*: last sync @ v3.16-rc6 (last commit: 2b8cac814cd5) +str*: last sync @ v3.16-rc6 (last commit: 0a42cb0a6fa6) linux/arch/arm64/lib/strchr.S xen/arch/arm/arm64/lib/strchr.S +linux/arch/arm64/lib/strcmp.S xen/arch/arm/arm64/lib/strcmp.S +linux/arch/arm64/lib/strlen.S xen/arch/arm/arm64/lib/strlen.S +linux/arch/arm64/lib/strncmp.S xen/arch/arm/arm64/lib/strncmp.S +linux/arch/arm64/lib/strnlen.S xen/arch/arm/arm64/lib/strnlen.S linux/arch/arm64/lib/strrchr.S xen/arch/arm/arm64/lib/strrchr.S +for i in strchr.S strcmp.S strlen.S strncmp.S strnlen.S strrchr.S ; do + diff -u linux/arch/arm64/lib/$i xen/arch/arm/arm64/lib/$i +done + --------------------------------------------------------------------- {clear,copy}_page: last sync @ v3.16-rc6 (last commit: f27bb139c387) diff --git a/xen/arch/arm/arm64/lib/Makefile b/xen/arch/arm/arm64/lib/Makefile index 2e7fb64..1b9c7a9 100644 --- a/xen/arch/arm/arm64/lib/Makefile +++ b/xen/arch/arm/arm64/lib/Makefile @@ -1,4 +1,4 @@ obj-y += memcpy.o memcmp.o memmove.o memset.o memchr.o obj-y += clear_page.o obj-y += bitops.o find_next_bit.o -obj-y += strchr.o strrchr.o +obj-y += strchr.o strcmp.o strlen.o strncmp.o strnlen.o strrchr.o diff --git a/xen/arch/arm/arm64/lib/strcmp.S b/xen/arch/arm/arm64/lib/strcmp.S new file mode 100644 index 0000000..bdcf7b0 --- /dev/null +++ b/xen/arch/arm/arm64/lib/strcmp.S @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#include "assembler.h" + +/* + * compare two strings + * + * Parameters: + * x0 - const string 1 pointer + * x1 - const string 2 pointer + * Returns: + * x0 - an integer less than, equal to, or greater than zero + * if s1 is found, respectively, to be less than, to match, + * or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +result .req x0 + +/* Internal variables. */ +data1 .req x2 +data1w .req w2 +data2 .req x3 +data2w .req w3 +has_nul .req x4 +diff .req x5 +syndrome .req x6 +tmp1 .req x7 +tmp2 .req x8 +tmp3 .req x9 +zeroones .req x10 +pos .req x11 + +ENTRY(strcmp) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, .Lloop_aligned + b .Lcal_cmpresult + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that preceed the start point. + */ + bic src1, src1, #7 + bic src2, src2, #7 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + ldr data1, [src1], #8 + neg tmp1, tmp1 /* Bits to alignment -64. */ + ldr data2, [src2], #8 + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b .Lstart_realigned + +.Lmisaligned8: + /* + * Get the align offset length to compare per byte first. + * After this process, one string's address will be aligned. + */ + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8 + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8 + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*find the null or unequal...*/ + cmp data1w, #1 + ccmp data1w, data2w, #0, cs + b.eq .Lstart_align /*the last bytes are equal....*/ +1: + sub result, data1, data2 + ret + +.Lstart_align: + ands xzr, src1, #7 + b.eq .Lrecal_offset + /*process more leading bytes to make str1 aligned...*/ + add src1, src1, tmp3 + add src2, src2, tmp3 + /*load 8 bytes from aligned str1 and non-aligned str2..*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbnz syndrome, .Lcal_cmpresult + /*How far is the current str2 from the alignment boundary...*/ + and tmp3, tmp3, #7 +.Lrecal_offset: + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes from the SRC2 alignment + * boundary,then compare with the relative bytes from SRC1. + * If all 8 bytes are equal,then start the second part's comparison. + * Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbnz syndrome, .Lcal_cmpresult + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbz syndrome, .Lloopcmp_proc + +.Lcal_cmpresult: + /* + * reversed the byte-order as big-endian,then CLZ can find the most + * significant zero bits. + */ +CPU_LE( rev syndrome, syndrome ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + + /* + * For big-endian we cannot use the trick with the syndrome value + * as carry-propagation can corrupt the upper bits if the trailing + * bytes in the string contain 0x01. + * However, if there is no NUL byte in the dword, we can generate + * the result directly. We ca not just subtract the bytes as the + * MSB might be significant. + */ +CPU_BE( cbnz has_nul, 1f ) +CPU_BE( cmp data1, data2 ) +CPU_BE( cset result, ne ) +CPU_BE( cneg result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) + /*Re-compute the NUL-byte detection, using a byte-reversed value. */ +CPU_BE( rev tmp3, data1 ) +CPU_BE( sub tmp1, tmp3, zeroones ) +CPU_BE( orr tmp2, tmp3, #REP8_7f ) +CPU_BE( bic has_nul, tmp1, tmp2 ) +CPU_BE( rev has_nul, has_nul ) +CPU_BE( orr syndrome, diff, has_nul ) + + clz pos, syndrome + /* + * The MS-non-zero bit of the syndrome marks either the first bit + * that is different, or the top bit of the first zero byte. + * Shifting left now will bring the critical information into the + * top bits. + */ + lsl data1, data1, pos + lsl data2, data2, pos + /* + * But we need to zero-extend (char is unsigned) the value and then + * perform a signed 32-bit subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +ENDPROC(strcmp) diff --git a/xen/arch/arm/arm64/lib/strlen.S b/xen/arch/arm/arm64/lib/strlen.S new file mode 100644 index 0000000..ee055a2 --- /dev/null +++ b/xen/arch/arm/arm64/lib/strlen.S @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#include "assembler.h" + + +/* + * calculate the length of a string + * + * Parameters: + * x0 - const string pointer + * Returns: + * x0 - the return length of specific string + */ + +/* Arguments and results. */ +srcin .req x0 +len .req x0 + +/* Locals and temporaries. */ +src .req x1 +data1 .req x2 +data2 .req x3 +data2a .req x4 +has_nul1 .req x5 +has_nul2 .req x6 +tmp1 .req x7 +tmp2 .req x8 +tmp3 .req x9 +tmp4 .req x10 +zeroones .req x11 +pos .req x12 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY(strlen) + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne .Lmisaligned + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ + /* + * The inner loop deals with two Dwords at a time. This has a + * slightly higher start-up cost, but we should win quite quickly, + * especially on cores with a high number of issue slots per + * cycle, as we get much better parallelism out of the operations. + */ +.Lloop: + ldp data1, data2, [src], #16 +.Lrealigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq .Lloop + + sub len, src, srcin + cbz has_nul1, .Lnul_in_data2 +CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ + sub len, len, #8 + mov has_nul2, has_nul1 +.Lnul_in_data2: + /* + * For big-endian, carry propagation (if the final byte in the + * string is 0x01) means we cannot use has_nul directly. The + * easiest way to get the correct byte is to byte-swap the data + * and calculate the syndrome a second time. + */ +CPU_BE( rev data2, data2 ) +CPU_BE( sub tmp1, data2, zeroones ) +CPU_BE( orr tmp2, data2, #REP8_7f ) +CPU_BE( bic has_nul2, tmp1, tmp2 ) + + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + ret + +.Lmisaligned: + cmp tmp1, #8 + neg tmp1, tmp1 + ldp data1, data2, [src], #16 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b .Lrealigned +ENDPROC(strlen) diff --git a/xen/arch/arm/arm64/lib/strncmp.S b/xen/arch/arm/arm64/lib/strncmp.S new file mode 100644 index 0000000..ca2e4a6 --- /dev/null +++ b/xen/arch/arm/arm64/lib/strncmp.S @@ -0,0 +1,311 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#include "assembler.h" + +/* + * compare two strings + * + * Parameters: + * x0 - const string 1 pointer + * x1 - const string 2 pointer + * x2 - the maximal length to be compared + * Returns: + * x0 - an integer less than, equal to, or greater than zero if s1 is found, + * respectively, to be less than, to match, or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +limit .req x2 +result .req x0 + +/* Internal variables. */ +data1 .req x3 +data1w .req w3 +data2 .req x4 +data2w .req w4 +has_nul .req x5 +diff .req x6 +syndrome .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +zeroones .req x11 +pos .req x12 +limit_wd .req x13 +mask .req x14 +endloop .req x15 + +ENTRY(strncmp) + cbz limit, .Lret0 + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + /* Calculate the number of full and partial words -1. */ + /* + * when limit is mulitply of 8, if not sub 1, + * the judgement of last dword will wrong. + */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq .Lloop_aligned + + /*Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, .Lnot_limit + + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq .Lnot_limit + + lsl limit, limit, #3 /* Bits -> bytes. */ + mov mask, #~0 +CPU_BE( lsr mask, mask, limit ) +CPU_LE( lsl mask, mask, limit ) + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +.Lnot_limit: + orr syndrome, diff, has_nul + b .Lcal_cmpresult + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that precede the start point. + * We also need to adjust the limit calculations, but without + * overflowing if the limit is near ULONG_MAX. + */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ + add limit, limit, tmp1 + add tmp3, tmp3, tmp1 + orr data1, data1, tmp2 + orr data2, data2, tmp2 + add limit_wd, limit_wd, tmp3, lsr #3 + b .Lstart_realigned + +/*when src1 offset is not equal to src2 offset...*/ +.Lmisaligned8: + cmp limit, #8 + b.lo .Ltiny8proc /*limit < 8... */ + /* + * Get the align offset length to compare per byte first. + * After this process, one string's address will be aligned.*/ + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8 + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8 + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ + /* + * Here, limit is not less than 8, so directly run .Ltinycmp + * without checking the limit.*/ + sub limit, limit, pos +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*find the null or unequal...*/ + cmp data1w, #1 + ccmp data1w, data2w, #0, cs + b.eq .Lstart_align /*the last bytes are equal....*/ +1: + sub result, data1, data2 + ret + +.Lstart_align: + lsr limit_wd, limit, #3 + cbz limit_wd, .Lremain8 + /*process more leading bytes to make str1 aligned...*/ + ands xzr, src1, #7 + b.eq .Lrecal_offset + add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ + add src2, src2, tmp3 + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub limit, limit, tmp3 + lsr limit_wd, limit, #3 + subs limit_wd, limit_wd, #1 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + bics has_nul, tmp1, tmp2 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ + b.ne .Lunequal_proc + /*How far is the current str2 from the alignment boundary...*/ + and tmp3, tmp3, #7 +.Lrecal_offset: + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes from the SRC2 alignment + * boundary,then compare with the relative bytes from SRC1. + * If all 8 bytes are equal,then start the second part's comparison. + * Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, eq + cbnz endloop, .Lunequal_proc + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + bics has_nul, tmp1, tmp2 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ + b.eq .Lloopcmp_proc + +.Lunequal_proc: + orr syndrome, diff, has_nul + cbz syndrome, .Lremain8 +.Lcal_cmpresult: + /* + * reversed the byte-order as big-endian,then CLZ can find the most + * significant zero bits. + */ +CPU_LE( rev syndrome, syndrome ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + /* + * For big-endian we cannot use the trick with the syndrome value + * as carry-propagation can corrupt the upper bits if the trailing + * bytes in the string contain 0x01. + * However, if there is no NUL byte in the dword, we can generate + * the result directly. We can't just subtract the bytes as the + * MSB might be significant. + */ +CPU_BE( cbnz has_nul, 1f ) +CPU_BE( cmp data1, data2 ) +CPU_BE( cset result, ne ) +CPU_BE( cneg result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) + /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ +CPU_BE( rev tmp3, data1 ) +CPU_BE( sub tmp1, tmp3, zeroones ) +CPU_BE( orr tmp2, tmp3, #REP8_7f ) +CPU_BE( bic has_nul, tmp1, tmp2 ) +CPU_BE( rev has_nul, has_nul ) +CPU_BE( orr syndrome, diff, has_nul ) + /* + * The MS-non-zero bit of the syndrome marks either the first bit + * that is different, or the top bit of the first zero byte. + * Shifting left now will bring the critical information into the + * top bits. + */ + clz pos, syndrome + lsl data1, data1, pos + lsl data2, data2, pos + /* + * But we need to zero-extend (char is unsigned) the value and then + * perform a signed 32-bit subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret + +.Lremain8: + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq .Lret0 +.Ltiny8proc: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltiny8proc + sub result, data1, data2 + ret + +.Lret0: + mov result, #0 + ret +ENDPROC(strncmp) diff --git a/xen/arch/arm/arm64/lib/strnlen.S b/xen/arch/arm/arm64/lib/strnlen.S new file mode 100644 index 0000000..8aa5bbf --- /dev/null +++ b/xen/arch/arm/arm64/lib/strnlen.S @@ -0,0 +1,172 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#include "assembler.h" + +/* + * determine the length of a fixed-size string + * + * Parameters: + * x0 - const string pointer + * x1 - maximal string length + * Returns: + * x0 - the return length of specific string + */ + +/* Arguments and results. */ +srcin .req x0 +len .req x0 +limit .req x1 + +/* Locals and temporaries. */ +src .req x2 +data1 .req x3 +data2 .req x4 +data2a .req x5 +has_nul1 .req x6 +has_nul2 .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +tmp4 .req x11 +zeroones .req x12 +pos .req x13 +limit_wd .req x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY(strnlen) + cbz limit, .Lhit_limit + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne .Lmisaligned + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ + /* + * The inner loop deals with two Dwords at a time. This has a + * slightly higher start-up cost, but we should win quite quickly, + * especially on cores with a high number of issue slots per + * cycle, as we get much better parallelism out of the operations. + */ +.Lloop: + ldp data1, data2, [src], #16 +.Lrealigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + subs limit_wd, limit_wd, #1 + orr tmp1, has_nul1, has_nul2 + ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ + b.eq .Lloop + + cbz tmp1, .Lhit_limit /* No null in final Qword. */ + + /* + * We know there's a null in the final Qword. The easiest thing + * to do now is work out the length of the string and return + * MIN (len, limit). + */ + sub len, src, srcin + cbz has_nul1, .Lnul_in_data2 +CPU_BE( mov data2, data1 ) /*perpare data to re-calculate the syndrome*/ + + sub len, len, #8 + mov has_nul2, has_nul1 +.Lnul_in_data2: + /* + * For big-endian, carry propagation (if the final byte in the + * string is 0x01) means we cannot use has_nul directly. The + * easiest way to get the correct byte is to byte-swap the data + * and calculate the syndrome a second time. + */ +CPU_BE( rev data2, data2 ) +CPU_BE( sub tmp1, data2, zeroones ) +CPU_BE( orr tmp2, data2, #REP8_7f ) +CPU_BE( bic has_nul2, tmp1, tmp2 ) + + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + cmp len, limit + csel len, len, limit, ls /* Return the lower value. */ + ret + +.Lmisaligned: + /* + * Deal with a partial first word. + * We're doing two things in parallel here; + * 1) Calculate the number of words (but avoiding overflow if + * limit is near ULONG_MAX) - to do this we need to work out + * limit + tmp1 - 1 as a 65-bit value before shifting it; + * 2) Load and mask the initial data words - we force the bytes + * before the ones we are interested in to 0xff - this ensures + * early bytes will not hit any zero detection. + */ + ldp data1, data2, [src], #16 + + sub limit_wd, limit, #1 + and tmp3, limit_wd, #15 + lsr limit_wd, limit_wd, #4 + + add tmp3, tmp3, tmp1 + add limit_wd, limit_wd, tmp3, lsr #4 + + neg tmp4, tmp1 + lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ + + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + + cmp tmp1, #8 + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b .Lrealigned + +.Lhit_limit: + mov len, limit + ret +ENDPROC(strnlen) diff --git a/xen/include/asm-arm/string.h b/xen/include/asm-arm/string.h index dfad1fe..e4b4469 100644 --- a/xen/include/asm-arm/string.h +++ b/xen/include/asm-arm/string.h @@ -14,6 +14,20 @@ extern char * strrchr(const char * s, int c); #define __HAVE_ARCH_STRCHR extern char * strchr(const char * s, int c); +#if defined(CONFIG_ARM_64) +#define __HAVE_ARCH_STRCMP +extern int strcmp(const char *, const char *); + +#define __HAVE_ARCH_STRNCMP +extern int strncmp(const char *, const char *, __kernel_size_t); + +#define __HAVE_ARCH_STRLEN +extern __kernel_size_t strlen(const char *); + +#define __HAVE_ARCH_STRNLEN +extern __kernel_size_t strnlen(const char *, __kernel_size_t); +#endif + #define __HAVE_ARCH_MEMCPY extern void * memcpy(void *, const void *, __kernel_size_t);