diff mbox

[RFC,1/6] crypto: sha512: implement base layer for SHA-512

Message ID 1427580628-7128-2-git-send-email-ard.biesheuvel@linaro.org
State New
Headers show

Commit Message

Ard Biesheuvel March 28, 2015, 10:10 p.m. UTC
To reduce the number of copies of boilerplate code throughout
the tree, this patch implements generic glue for the SHA-512
algorithm. This allows a specific arch or hardware implementation
to only implement the special handling that it needs.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 crypto/Kconfig       |   3 ++
 crypto/Makefile      |   1 +
 crypto/sha512_base.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/crypto/sha.h |  20 +++++++
 4 files changed, 167 insertions(+)
 create mode 100644 crypto/sha512_base.c

Comments

Ard Biesheuvel March 29, 2015, 10:38 a.m. UTC | #1
On 29 March 2015 at 10:29, Markus Stockhausen <stockhausen@collogia.de> wrote:
>> Von: linux-crypto-owner@vger.kernel.org [linux-crypto-owner@vger.kernel.org]" im Auftrag von "Ard Biesheuvel [ard.biesheuvel@linaro.org]
>> Gesendet: Samstag, 28. März 2015 23:10
>> An: linux-arm-kernel@lists.infradead.org; linux-crypto@vger.kernel.org; samitolvanen@google.com; herbert@gondor.apana.org.au; jussi.kivilinna@iki.fi
>> Cc: Ard Biesheuvel
>> Betreff: [RFC PATCH 1/6] crypto: sha512: implement base layer for SHA-512
>>
>> To reduce the number of copies of boilerplate code throughout
>> the tree, this patch implements generic glue for the SHA-512
>> algorithm. This allows a specific arch or hardware implementation
>> to only implement the special handling that it needs.
>
> Hi Ard,
>
> Implementing a common layer is a very good idea - I didn't like to
> implement the glue code once again for some recently developed
> PPC crypto modules. From my very short crypto experience I was
> surprised that my optimized implementations degraded disproportionately
> for small calculations in the <=256byte update scenarios in contrast to
> some very old basic implementations. Below you will find some hints,
> that might fit your implementation too. Thus all new implementations
> based on your framework could benefit immediately.
>

Thanks for taking a look!

>> ...
>> +int sha384_base_init(struct shash_desc *desc)
>> +{
>> +       struct sha512_state *sctx = shash_desc_ctx(desc);
>> +
>> +       *sctx = (struct sha512_state){
>> +               .state = {
>> +                       SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
>> +                       SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
>> +               }
>> +       };
>> +       return 0;
>> +}
>
> IIRC the above code will initialize the whole context including the 64/128
> byte buffer. Direct assignment of the 8 hashes was faster in my case.
>

Ah, I missed that. I will change it.

>> ...
>> +int sha512_base_do_update(struct shash_desc *desc, const u8 *data,
>> +                         unsigned int len, sha512_block_fn *block_fn, void *p)
>> +{
>> +       struct sha512_state *sctx = shash_desc_ctx(desc);
>> +       unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
>> +
>> +       sctx->count[0] += len;
>> +       if (sctx->count[0] < len)
>> +               sctx->count[1]++;
>
> You should check whether an early kick-out at this point, when the buffer
> won't be filled up, is faster than first taking care of big data. That can
> improve performance for small blocks while large blocks might be unaffected.
>
>> +
>> +       if ((partial + len) >= SHA512_BLOCK_SIZE) {

Isn't this early kickout? The if is only entered if there is enough
data to run the block function, otherwise it is a straight memcpy. I
could add an unlikely() here to favor the small data case.


>> +               int blocks;
>> +
>> +               if (partial) {
>> +                       int p = SHA512_BLOCK_SIZE - partial;
>> +
>> +                       memcpy(sctx->buf + partial, data, p);
>> +                       data += p;
>> +                       len -= p;
>> +               }
>> +
>> +               blocks = len / SHA512_BLOCK_SIZE;
>> +               len %= SHA512_BLOCK_SIZE;
>> +
>> +               block_fn(blocks, data, sctx->state,
>> +                        partial ? sctx->buf : NULL, p);
>> +               data += blocks * SHA512_BLOCK_SIZE;
>> +               partial = 0;
>> +       }
>> +       if (len)
>> +               memcpy(sctx->buf + partial, data, len);
>> +
>> +       return 0;
>> +}
>> +EXPORT_SYMBOL(sha512_base_do_update);
>> +
>> +int sha512_base_do_finalize(struct shash_desc *desc, sha512_block_fn *block_fn,
>> +                           void *p)
>> +{
>> +       static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
>> +
>> +       struct sha512_state *sctx = shash_desc_ctx(desc);
>> +       unsigned int padlen;
>> +       __be64 bits[2];
>> +
>> +       padlen = SHA512_BLOCK_SIZE -
>> +                (sctx->count[0] + sizeof(bits)) % SHA512_BLOCK_SIZE;
>> +
>> +       bits[0] = cpu_to_be64(sctx->count[1] << 3 |
>> +                             sctx->count[0] >> 61);
>> +       bits[1] = cpu_to_be64(sctx->count[0] << 3);
>> +
>> +       sha512_base_do_update(desc, padding, padlen, block_fn, p);
>
> I know that this is the most intuitive and straightforward implementation for
> handling finalization. Nevertheless, the (maybe a little obscure) generic md5
> algorithm gives best-in-class performance for hash finalization of small input data.
>

Well, memcpy'ing a buffer consisting almost entirely of zeroes doesn't
quite feel right, indeed.
I will instead follow the md5 suggestion.

> For comparison: From the raw numbers, the sha1-ppc-spe assembler module
> written by me is only 10% faster than the old sha1-powerpc assembler module.
> Both are simple assembler algorithms without hardware acceleration. For large
> blocks I gain another 8% by avoiding function calls because the core module
> may process several blocks. But for small single block updates the above glue
> code optimizations gave:
>
> 16byte block single update: +24%
> 64byte block single update: +16%
> 256byte block single update: +12%
>
> Considering CPU assisted SHA calculations that percentage may be even higher.
>
> Maybe worth the effort ...
>

Absolutely!

Thanks again
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 88639937a934..3400cf4e3cdb 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -641,6 +641,9 @@  config CRYPTO_SHA256_SPARC64
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using sparc64 crypto instructions, when available.
 
+config CRYPTO_SHA512_BASE
+	tristate
+
 config CRYPTO_SHA512
 	tristate "SHA384 and SHA512 digest algorithms"
 	select CRYPTO_HASH
diff --git a/crypto/Makefile b/crypto/Makefile
index 97b7d3ac87e7..6174bf2592fe 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -45,6 +45,7 @@  obj-$(CONFIG_CRYPTO_RMD256) += rmd256.o
 obj-$(CONFIG_CRYPTO_RMD320) += rmd320.o
 obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o
 obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
+obj-$(CONFIG_CRYPTO_SHA512_BASE) += sha512_base.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
 obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
diff --git a/crypto/sha512_base.c b/crypto/sha512_base.c
new file mode 100644
index 000000000000..488e24cc6f0a
--- /dev/null
+++ b/crypto/sha512_base.c
@@ -0,0 +1,143 @@ 
+/*
+ * sha512_base.c - core logic for SHA-512 implementations
+ *
+ * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+#include <asm/unaligned.h>
+
+int sha384_base_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha512_state){
+		.state = {
+			SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
+			SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
+		}
+	};
+	return 0;
+}
+EXPORT_SYMBOL(sha384_base_init);
+
+int sha512_base_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha512_state){
+		.state = {
+			SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+			SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7,
+		}
+	};
+	return 0;
+}
+EXPORT_SYMBOL(sha512_base_init);
+
+int sha512_base_export(struct shash_desc *desc, void *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	struct sha512_state *dst = out;
+
+	*dst = *sctx;
+
+	return 0;
+}
+EXPORT_SYMBOL(sha512_base_export);
+
+int sha512_base_import(struct shash_desc *desc, const void *in)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	struct sha512_state const *src = in;
+
+	*sctx = *src;
+
+	return 0;
+}
+EXPORT_SYMBOL(sha512_base_import);
+
+int sha512_base_do_update(struct shash_desc *desc, const u8 *data,
+			  unsigned int len, sha512_block_fn *block_fn, void *p)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+
+	sctx->count[0] += len;
+	if (sctx->count[0] < len)
+		sctx->count[1]++;
+
+	if ((partial + len) >= SHA512_BLOCK_SIZE) {
+		int blocks;
+
+		if (partial) {
+			int p = SHA512_BLOCK_SIZE - partial;
+
+			memcpy(sctx->buf + partial, data, p);
+			data += p;
+			len -= p;
+		}
+
+		blocks = len / SHA512_BLOCK_SIZE;
+		len %= SHA512_BLOCK_SIZE;
+
+		block_fn(blocks, data, sctx->state,
+			 partial ? sctx->buf : NULL, p);
+		data += blocks * SHA512_BLOCK_SIZE;
+		partial = 0;
+	}
+	if (len)
+		memcpy(sctx->buf + partial, data, len);
+
+	return 0;
+}
+EXPORT_SYMBOL(sha512_base_do_update);
+
+int sha512_base_do_finalize(struct shash_desc *desc, sha512_block_fn *block_fn,
+			    void *p)
+{
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int padlen;
+	__be64 bits[2];
+
+	padlen = SHA512_BLOCK_SIZE -
+		 (sctx->count[0] + sizeof(bits)) % SHA512_BLOCK_SIZE;
+
+	bits[0] = cpu_to_be64(sctx->count[1] << 3 |
+			      sctx->count[0] >> 61);
+	bits[1] = cpu_to_be64(sctx->count[0] << 3);
+
+	sha512_base_do_update(desc, padding, padlen, block_fn, p);
+
+	memcpy(sctx->buf + SHA512_BLOCK_SIZE - sizeof(bits),
+	       bits, sizeof(bits));
+
+	block_fn(1, sctx->buf, sctx->state, NULL, p);
+	return 0;
+}
+EXPORT_SYMBOL(sha512_base_do_finalize);
+
+int sha512_base_finish(struct shash_desc *desc, u8 *out)
+{
+	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	__be64 *digest = (__be64 *)out;
+	int i;
+
+	for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be64))
+		put_unaligned_be64(sctx->state[i], digest++);
+
+	*sctx = (struct sha512_state){};
+	return 0;
+}
+EXPORT_SYMBOL(sha512_base_finish);
diff --git a/include/crypto/sha.h b/include/crypto/sha.h
index 190f8a0e0242..d252a8fcc4de 100644
--- a/include/crypto/sha.h
+++ b/include/crypto/sha.h
@@ -82,6 +82,9 @@  struct sha512_state {
 	u8 buf[SHA512_BLOCK_SIZE];
 };
 
+typedef void (sha512_block_fn)(int blocks, u8 const *src, u64 *state,
+			       const u8 *head, void *p);
+
 struct shash_desc;
 
 extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
@@ -92,4 +95,21 @@  extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
 
 extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len);
+
+
+extern int sha384_base_init(struct shash_desc *desc);
+extern int sha512_base_init(struct shash_desc *desc);
+
+extern int sha512_base_export(struct shash_desc *desc, void *out);
+extern int sha512_base_import(struct shash_desc *desc, const void *in);
+
+extern int sha512_base_do_update(struct shash_desc *desc, const u8 *data,
+			         unsigned int len, sha512_block_fn *block_fn,
+			         void *p);
+
+extern int sha512_base_do_finalize(struct shash_desc *desc,
+				   sha512_block_fn *block_fn, void *p);
+
+extern int sha512_base_finish(struct shash_desc *desc, u8 *out);
+
 #endif