[v2,3/4] crypto: x86/sha256-ni - optimize code size

Message ID	20240411162359.39073-4-ebiggers@kernel.org
State	Superseded
Headers	show Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 877EA3B182; Thu, 11 Apr 2024 16:25:45 +0000 (UTC) From: Eric Biggers <ebiggers@kernel.org> To: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org, Stefan Kanthak <stefan.kanthak@nexgo.de> Subject: [PATCH v2 3/4] crypto: x86/sha256-ni - optimize code size Date: Thu, 11 Apr 2024 09:23:58 -0700 Message-ID: <20240411162359.39073-4-ebiggers@kernel.org> In-Reply-To: <20240411162359.39073-1-ebiggers@kernel.org> References: <20240411162359.39073-1-ebiggers@kernel.org> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	crypto: x86/sha256-ni - cleanup and optimization \| expand [v2,0/4] crypto: x86/sha256-ni - cleanup and optimization [v2,1/4] crypto: x86/sha256-ni - convert to use rounds macros [v2,2/4] crypto: x86/sha256-ni - rename some register aliases [v2,3/4] crypto: x86/sha256-ni - optimize code size [v2,4/4] crypto: x86/sha256-ni - simplify do_4rounds

Message ID

20240411162359.39073-4-ebiggers@kernel.org

State

Superseded

Headers

From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org,
	Stefan Kanthak <stefan.kanthak@nexgo.de>
Subject: [PATCH v2 3/4] crypto: x86/sha256-ni - optimize code size
Date: Thu, 11 Apr 2024 09:23:58 -0700
Message-ID: <20240411162359.39073-4-ebiggers@kernel.org>
In-Reply-To: <20240411162359.39073-1-ebiggers@kernel.org>
References: <20240411162359.39073-1-ebiggers@kernel.org>
Precedence: bulk
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

Series

crypto: x86/sha256-ni - cleanup and optimization | expand

Commit Message

Eric Biggers April 11, 2024, 4:23 p.m. UTC

From: Eric Biggers <ebiggers@google.com>

- Load the SHA-256 round constants relative to a pointer that points
  into the middle of the constants rather than to the beginning.  Since
  x86 instructions use signed offsets, this decreases the instruction
  length required to access some of the later round constants.

- Use punpcklqdq or punpckhqdq instead of longer instructions such as
  pshufd, pblendw, and palignr.  This doesn't harm performance.

The end result is that sha256_ni_transform shrinks from 839 bytes to 791
bytes, with no loss in performance.

Suggested-by: Stefan Kanthak <stefan.kanthak@nexgo.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index b7e7001dafdf..ffc9f1c75c15 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -82,19 +82,19 @@ 
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, \m0
 .else
 	movdqa		\m0, MSG
 .endif
-	paddd		\i*4(SHA256CONSTANTS), MSG
+	paddd		(\i-32)*4(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
 .if \i >= 12 && \i < 60
 	movdqa		\m0, TMP
 	palignr		$4, \m3, TMP
 	paddd		TMP, \m1
 	sha256msg2	\m0, \m1
 .endif
-	pshufd 		$0x0E, MSG, MSG
+	punpckhqdq	MSG, MSG
 	sha256rnds2	STATE1, STATE0
 .if \i >= 4 && \i < 52
 	sha256msg1	\m0, \m3
 .endif
 .endm
@@ -126,21 +126,21 @@  SYM_TYPED_FUNC_START(sha256_ni_transform)
 	/*
 	 * load initial hash values
 	 * Need to reorder these appropriately
 	 * DCBA, HGFE -> ABEF, CDGH
 	 */
-	movdqu		0*16(DIGEST_PTR), STATE0
-	movdqu		1*16(DIGEST_PTR), STATE1
+	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
+	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */
 
-	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
-	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
 	movdqa		STATE0, TMP
-	palignr		$8, STATE1,  STATE0		/* ABEF */
-	pblendw		$0xF0, TMP, STATE1		/* CDGH */
+	punpcklqdq	STATE1, STATE0			/* FEBA */
+	punpckhqdq	TMP, STATE1			/* DCHG */
+	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
+	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
 
 	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-	lea		K256(%rip), SHA256CONSTANTS
+	lea		K256+32*4(%rip), SHA256CONSTANTS
 
 .Lloop0:
 	/* Save hash values for addition after rounds */
 	movdqa		STATE0, ABEF_SAVE
 	movdqa		STATE1, CDGH_SAVE
@@ -160,18 +160,18 @@  SYM_TYPED_FUNC_START(sha256_ni_transform)
 	add		$64, DATA_PTR
 	cmp		NUM_BLKS, DATA_PTR
 	jne		.Lloop0
 
 	/* Write hash values back in the correct order */
-	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
-	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
 	movdqa		STATE0, TMP
-	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
-	palignr		$8, TMP, STATE1			/* HGFE */
+	punpcklqdq	STATE1, STATE0			/* GHEF */
+	punpckhqdq	TMP, STATE1			/* ABCD */
+	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
+	pshufd		$0x1B, STATE1, STATE1		/* DCBA */
 
-	movdqu		STATE0, 0*16(DIGEST_PTR)
-	movdqu		STATE1, 1*16(DIGEST_PTR)
+	movdqu		STATE1, 0*16(DIGEST_PTR)
+	movdqu		STATE0, 1*16(DIGEST_PTR)
 
 .Ldone_hash:
 
 	RET
 SYM_FUNC_END(sha256_ni_transform)

[v2,3/4] crypto: x86/sha256-ni - optimize code size

Commit Message

Patch