
[RFC] crypto: riscv: scalar accelerated GHASH

Message ID 20250417064940.68469-1-dqfext@gmail.com
State New
Series [RFC] crypto: riscv: scalar accelerated GHASH

Commit Message

Qingfang Deng April 17, 2025, 6:49 a.m. UTC
From: Qingfang Deng <qingfang.deng@siflower.com.cn>

Add a scalar implementation of GHASH for RISC-V using the Zbc (carry-less
multiplication) and Zbb (bit-manipulation) extensions. This implementation
is adapted from OpenSSL but rewritten in plain C for clarity.

Unlike the OpenSSL code, which relies on bit-reflecting the data, this
version uses a pre-computed (reflected and pre-multiplied) key, inspired
by the approach used in Intel's CLMUL driver, to avoid reflections at
runtime.
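
Concretely, H is loaded byte-swapped once at setkey time and then
multiplied by x modulo the GHASH polynomial. A minimal sketch of that
pre-computation, mirroring riscv64_clmul_ghash_setkey() below (the
helper name is illustrative only):

	#define GHASH_MOD_POLY	0xc200000000000000ULL

	/* Illustration: rotate H left by 1 (shift left, MSB wraps into
	 * bit 0) and reduce by the GHASH polynomial if the MSB was set.
	 */
	static __uint128_t ghash_precompute_key(__uint128_t h)
	{
		__uint128_t k = (h << 1) | (h >> 127);

		if (h >> 127)
			k ^= (__uint128_t)GHASH_MOD_POLY << 64;
		return k;
	}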

Signed-off-by: Qingfang Deng <qingfang.deng@siflower.com.cn>
---
 arch/riscv/crypto/Kconfig               |  16 +-
 arch/riscv/crypto/Makefile              |   2 +
 arch/riscv/crypto/ghash-riscv64-clmul.c | 270 ++++++++++++++++++++++++
 3 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 arch/riscv/crypto/ghash-riscv64-clmul.c

Comments

Qingfang Deng April 18, 2025, 2:49 a.m. UTC | #1
Hi Eric,

On Fri, Apr 18, 2025 at 1:09 AM Eric Biggers <ebiggers@kernel.org> wrote:
>
> Please help properly optimize swab*() and {get,put}_unaligned_* for RISC-V
> first, before considering random hacks like this.
>
> https://lore.kernel.org/r/20250403-riscv-swab-v3-0-3bf705d80e33@iencinas.com
> is working on swab*().

Indeed. In fact, our downstream NONPORTABLE build already uses
{get,put}_unaligned_be64; we've modified the Makefile so the compiler
can assume fast unaligned access and emit efficient swab*() code.
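
Roughly along these lines (illustrative only; the exact ISA string and
flags in our downstream tree differ):

	# Hypothetical, NONPORTABLE: assume fast unaligned access and let
	# the compiler emit rev8 for swab*()/{get,put}_unaligned_be64()
	CFLAGS_ghash-riscv64-clmul.o += -march=rv64gc_zbb_zbc -mno-strict-align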

>
> > +             /* Multiplication (without Karatsuba) */
> > +             t0 = clmul128(p_lo, k_lo);
> > +             t1 = clmul128(p_lo, k_hi);
> > +             t2 = clmul128(p_hi, k_lo);
> > +             t3 = clmul128(p_hi, k_hi);
> > +             mid = t1 ^ t2;
> > +             lo = t0 ^ (mid << 64);
> > +             hi = t3 ^ (mid >> 64);
>
> There is no need to explicitly XOR 'mid << 64' into lo and 'mid >> 64' into hi.
> Take a look at how arch/x86/crypto/aes-gcm-*.S do it.

Thanks, I saw your comments in aes-gcm-avx10-x86_64.S and now
understand what you meant.

However, since we're working with 64-bit scalar registers on RISC-V
(as opposed to 128-bit SIMD registers on x86), there's no reduction in
the number of XOR instructions. Regardless of whether we explicitly
compute mid and shift it, or fold the intermediate results in directly,
we end up with the same set of individual 64-bit XORs to combine t0,
t1, t2, and t3.

So while the optimization helps on x86 due to wider registers and
vector instructions, it doesn't offer a benefit in our scalar RISC-V
implementation.
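
To illustrate, this is roughly how the combine lowers to 64-bit halves
on rv64 (a sketch, not actual compiler output; the *_lo/*_hi variables
are just the two halves of each __uint128_t):

	static inline void combine_sketch(u64 t0_lo, u64 t0_hi,
					  u64 t1_lo, u64 t1_hi,
					  u64 t2_lo, u64 t2_hi,
					  u64 t3_lo, u64 t3_hi,
					  u64 *lo_hi, u64 *hi_lo)
	{
		/* Variant used in the patch: explicit mid */
		u64 mid_lo = t1_lo ^ t2_lo;
		u64 mid_hi = t1_hi ^ t2_hi;

		*lo_hi = t0_hi ^ mid_lo; /* mid << 64 only hits lo's high half */
		*hi_lo = t3_lo ^ mid_hi; /* mid >> 64 only hits hi's low half */

		/*
		 * Folding t1/t2 in directly performs the same 64-bit XORs,
		 * just associated differently:
		 *   *lo_hi = t0_hi ^ t1_lo ^ t2_lo;
		 *   *hi_lo = t3_lo ^ t1_hi ^ t2_hi;
		 * lo's low half (t0_lo) and hi's high half (t3_hi) are not
		 * touched in either variant.
		 */
	}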

>
> Also, since this is only doing one block at a time and does not use Karatsuba
> multiplication, the single-step reduction would work well here.  See
> aes-gcm-aesni-x86_64.S.

I saw the key pre-computation step. Is it the same as the step
described on page 12 of this PDF?

[1] https://builders.intel.com/docs/networkbuilders/advanced-encryption-standard-galois-counter-mode-optimized-ghash-function-technology-guide-1693300747.pdf

>
> - Eric

Patch

diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index 6392e1e11bc9..03b74d4116cb 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -26,7 +26,7 @@  config CRYPTO_CHACHA_RISCV64
 	default CRYPTO_LIB_CHACHA_INTERNAL
 
 config CRYPTO_GHASH_RISCV64
-	tristate "Hash functions: GHASH"
+	tristate "Hash functions: GHASH (vector accelerated)"
 	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
 	select CRYPTO_GCM
 	help
@@ -35,6 +35,20 @@  config CRYPTO_GHASH_RISCV64
 	  Architecture: riscv64 using:
 	  - Zvkg vector crypto extension
 
+config CRYPTO_GHASH_RISCV64_CLMUL
+	tristate "Hash functions: GHASH (CLMUL scalar accelerated)"
+	depends on 64BIT && TOOLCHAIN_HAS_ZBB && TOOLCHAIN_HAS_ZBC
+	select CRYPTO_GCM
+	help
+	  GCM GHASH function (NIST SP 800-38D)
+
+	  Architecture: riscv64 using:
+	  - Zbb Bitmanipulation extension
+	  - Zbc Carry-less multiplication
+	    OR
+	  - Zbkb Bit-manipulation for Cryptography
+	  - Zbkc Carry-less multiplication for Cryptography
+
 config CRYPTO_SHA256_RISCV64
 	tristate "Hash functions: SHA-224 and SHA-256"
 	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
index 247c7bc7288c..b5dc497d398c 100644
--- a/arch/riscv/crypto/Makefile
+++ b/arch/riscv/crypto/Makefile
@@ -10,6 +10,8 @@  chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
 obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
 ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
 
+obj-$(CONFIG_CRYPTO_GHASH_RISCV64_CLMUL) += ghash-riscv64-clmul.o
+
 obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
 sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
 
diff --git a/arch/riscv/crypto/ghash-riscv64-clmul.c b/arch/riscv/crypto/ghash-riscv64-clmul.c
new file mode 100644
index 000000000000..4777aa8e94cb
--- /dev/null
+++ b/arch/riscv/crypto/ghash-riscv64-clmul.c
@@ -0,0 +1,270 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * GHASH using the RISC-V Zbc/Zbkc (CLMUL) extension
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Christoph Müllner <christoph.muellner@vrull.eu>
+ *
+ * Copyright (C) 2025 Siflower Communications Ltd
+ * Author: Qingfang Deng <qingfang.deng@siflower.com.cn>
+ */
+
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+
+#define GHASH_MOD_POLY	0xc200000000000000
+
+struct riscv64_clmul_ghash_ctx {
+	__uint128_t key;
+};
+
+struct riscv64_clmul_ghash_desc_ctx {
+	__uint128_t shash;
+	u8 buffer[GHASH_DIGEST_SIZE];
+	int bytes;
+};
+
+static __always_inline u64 riscv_zbb_swab64(u64 val)
+{
+	asm (".option push\n"
+	     ".option arch,+zbb\n"
+	     "rev8 %0, %1\n"
+	     ".option pop\n"
+	     : "=r" (val) : "r" (val));
+	return val;
+}
+
+static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
+{
+	__uint128_t val;
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	val = *(__uint128_t *)p;
+	val = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
+#else
+	val = (__uint128_t)p[0] << 120;
+	val |= (__uint128_t)p[1] << 112;
+	val |= (__uint128_t)p[2] << 104;
+	val |= (__uint128_t)p[3] << 96;
+	val |= (__uint128_t)p[4] << 88;
+	val |= (__uint128_t)p[5] << 80;
+	val |= (__uint128_t)p[6] << 72;
+	val |= (__uint128_t)p[7] << 64;
+	val |= (__uint128_t)p[8] << 56;
+	val |= (__uint128_t)p[9] << 48;
+	val |= (__uint128_t)p[10] << 40;
+	val |= (__uint128_t)p[11] << 32;
+	val |= (__uint128_t)p[12] << 24;
+	val |= (__uint128_t)p[13] << 16;
+	val |= (__uint128_t)p[14] << 8;
+	val |= (__uint128_t)p[15];
+#endif
+	return val;
+}
+
+static __always_inline void put_unaligned_be128(__uint128_t val, u8 *p)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	*(__uint128_t *)p = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
+#else
+	p[0] = val >> 120;
+	p[1] = val >> 112;
+	p[2] = val >> 104;
+	p[3] = val >> 96;
+	p[4] = val >> 88;
+	p[5] = val >> 80;
+	p[6] = val >> 72;
+	p[7] = val >> 64;
+	p[8] = val >> 56;
+	p[9] = val >> 48;
+	p[10] = val >> 40;
+	p[11] = val >> 32;
+	p[12] = val >> 24;
+	p[13] = val >> 16;
+	p[14] = val >> 8;
+	p[15] = val;
+#endif
+}
+
+static __always_inline __attribute_const__
+__uint128_t clmul128(u64 a, u64 b)
+{
+	u64 hi, lo;
+
+	asm(".option push\n"
+	    ".option arch,+zbc\n"
+	    "clmul	%0, %2, %3\n"
+	    "clmulh	%1, %2, %3\n"
+	    ".option pop\n"
+	    : "=&r" (lo), "=&r" (hi) : "r" (a), "r" (b));
+	return (__uint128_t)hi << 64 | lo;
+}
+
+static int riscv64_clmul_ghash_init(struct shash_desc *desc)
+{
+	struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	dctx->bytes = 0;
+	dctx->shash = 0;
+	return 0;
+}
+
+/* Compute GMULT (Xi*H mod f) using the Zbc (clmul) extension.
+ * This uses the no-Karatsuba approach and clmul for the final reduction,
+ * which results in an implementation with a minimized instruction count.
+ * HW with clmul latencies higher than 2 cycles might observe a performance
+ * improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
+ * might observe a performance improvement with additionally converting the
+ * reduction to shift&xor. For a full discussion of these estimates, see
+ * https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
+ */
+static void gcm_ghash_rv64i_zbc(__uint128_t *Xi, __uint128_t k, const u8 *inp, size_t len)
+{
+	u64 k_hi = k >> 64, k_lo = k, p_hi, p_lo;
+	__uint128_t hash = *Xi, p;
+
+	do {
+		__uint128_t t0, t1, t2, t3, lo, mid, hi;
+
+		/* Load the input block, byte-reverse it, and XOR it with Xi */
+		p = get_unaligned_be128(inp);
+
+		inp += GHASH_BLOCK_SIZE;
+		len -= GHASH_BLOCK_SIZE;
+
+		p ^= hash;
+		p_hi = p >> 64;
+		p_lo = p;
+
+		/* Multiplication (without Karatsuba) */
+		t0 = clmul128(p_lo, k_lo);
+		t1 = clmul128(p_lo, k_hi);
+		t2 = clmul128(p_hi, k_lo);
+		t3 = clmul128(p_hi, k_hi);
+		mid = t1 ^ t2;
+		lo = t0 ^ (mid << 64);
+		hi = t3 ^ (mid >> 64);
+
+		/* Reduction with clmul */
+		mid = clmul128(lo, GHASH_MOD_POLY);
+		lo ^= mid << 64;
+		hi ^= lo ^ (mid >> 64);
+		hi ^= clmul128(lo >> 64, GHASH_MOD_POLY);
+		hash = hi;
+	} while (len);
+
+	*Xi = hash;
+}
+
+static int riscv64_clmul_ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen)
+{
+	struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(tfm);
+	__uint128_t k;
+
+	if (keylen != GHASH_BLOCK_SIZE)
+		return -EINVAL;
+
+	k = get_unaligned_be128(key);
+	k = (k << 1 | k >> 127) ^ (k >> 127 ? (__uint128_t)GHASH_MOD_POLY << 64 : 0);
+	ctx->key = k;
+
+	return 0;
+}
+
+static int riscv64_clmul_ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen)
+{
+	struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	unsigned int len;
+
+	if (dctx->bytes) {
+		if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
+			memcpy(dctx->buffer + dctx->bytes, src, srclen);
+			dctx->bytes += srclen;
+			return 0;
+		}
+		memcpy(dctx->buffer + dctx->bytes, src, GHASH_DIGEST_SIZE - dctx->bytes);
+
+		gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
+
+		src += GHASH_DIGEST_SIZE - dctx->bytes;
+		srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
+		dctx->bytes = 0;
+	}
+
+	len = round_down(srclen, GHASH_BLOCK_SIZE);
+	if (len) {
+		gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, src, len);
+		src += len;
+		srclen -= len;
+	}
+
+	if (srclen) {
+		memcpy(dctx->buffer, src, srclen);
+		dctx->bytes = srclen;
+	}
+	return 0;
+}
+
+static int riscv64_clmul_ghash_final(struct shash_desc *desc, u8 out[GHASH_DIGEST_SIZE])
+{
+	struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	int i;
+
+	if (dctx->bytes) {
+		for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
+			dctx->buffer[i] = 0;
+		gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
+		dctx->bytes = 0;
+	}
+	put_unaligned_be128(dctx->shash, out);
+	return 0;
+}
+
+static struct shash_alg riscv64_clmul_ghash_alg = {
+	.init = riscv64_clmul_ghash_init,
+	.update = riscv64_clmul_ghash_update,
+	.final = riscv64_clmul_ghash_final,
+	.setkey = riscv64_clmul_ghash_setkey,
+	.descsize = sizeof(struct riscv64_clmul_ghash_desc_ctx),
+	.digestsize = GHASH_DIGEST_SIZE,
+	.base = {
+		 .cra_blocksize = GHASH_BLOCK_SIZE,
+		 .cra_ctxsize = sizeof(struct riscv64_clmul_ghash_ctx),
+		 .cra_priority = 250,
+		 .cra_name = "ghash",
+		 .cra_driver_name = "ghash-riscv64-clmul",
+		 .cra_module = THIS_MODULE,
+	},
+};
+
+static int __init riscv64_clmul_ghash_mod_init(void)
+{
+	bool has_clmul, has_rev8;
+
+	has_clmul = riscv_isa_extension_available(NULL, ZBC) ||
+		    riscv_isa_extension_available(NULL, ZBKC);
+	has_rev8 = riscv_isa_extension_available(NULL, ZBB) ||
+		   riscv_isa_extension_available(NULL, ZBKB);
+	if (has_clmul && has_rev8)
+		return crypto_register_shash(&riscv64_clmul_ghash_alg);
+
+	return -ENODEV;
+}
+
+static void __exit riscv64_clmul_ghash_mod_fini(void)
+{
+	crypto_unregister_shash(&riscv64_clmul_ghash_alg);
+}
+
+module_init(riscv64_clmul_ghash_mod_init);
+module_exit(riscv64_clmul_ghash_mod_fini);
+
+MODULE_DESCRIPTION("GHASH (RISC-V CLMUL accelerated)");
+MODULE_AUTHOR("Qingfang Deng <dqfext@gmail.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("ghash");