
[v2,04/20] crypto: arm/chacha - expose ARM ChaCha routine as library function

Message ID 20191002141713.31189-5-ard.biesheuvel@linaro.org
State New
Series crypto: crypto API library interfaces for WireGuard

Commit Message

Ard Biesheuvel Oct. 2, 2019, 2:16 p.m. UTC
Expose the accelerated NEON ChaCha routine directly as a symbol
export so that users of the ChaCha library can use it directly.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 arch/arm/crypto/Kconfig            |  1 +
 arch/arm/crypto/chacha-neon-glue.c | 40 ++++++++++++++++++--
 2 files changed, 38 insertions(+), 3 deletions(-)

-- 
2.20.1

Comments

Jason A. Donenfeld Oct. 4, 2019, 1:52 p.m. UTC | #1
On Wed, Oct 2, 2019 at 4:17 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> Expose the accelerated NEON ChaCha routine directly as a symbol
> export so that users of the ChaCha library can use it directly.

Eric had some nice code for ChaCha for certain ARM cores that lived in
Zinc as chacha20-unrolled-arm.S. This code became active for certain
cores where NEON was bad and for cores with no NEON. The condition for
it was:

        switch (read_cpuid_part()) {
        case ARM_CPU_PART_CORTEX_A7:
        case ARM_CPU_PART_CORTEX_A5:
                /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
                 * implementation but do incredibly with the scalar one and use
                 * less power.
                 */
                break;
        default:
                chacha20_use_neon = elf_hwcap & HWCAP_NEON;
        }

...

        for (;;) {
                if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
                    len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context)) {
                        const size_t bytes = min_t(size_t, len, PAGE_SIZE);

                        chacha20_neon(dst, src, bytes, ctx->key, ctx->counter);
                        ctx->counter[0] += (bytes + 63) / 64;
                        len -= bytes;
                        if (!len)
                                break;
                        dst += bytes;
                        src += bytes;
                        simd_relax(simd_context);
                } else {
                        chacha20_arm(dst, src, len, ctx->key, ctx->counter);
                        ctx->counter[0] += (len + 63) / 64;
                        break;
                }
        }

It's another instance in which the generic code was totally optimized
out of Zinc builds.

Did these changes make it into the existing tree?
Ard Biesheuvel Oct. 4, 2019, 2:23 p.m. UTC | #2
On Fri, 4 Oct 2019 at 15:53, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Wed, Oct 2, 2019 at 4:17 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > Expose the accelerated NEON ChaCha routine directly as a symbol
> > export so that users of the ChaCha library can use it directly.
>
> Eric had some nice code for ChaCha for certain ARM cores that lived in
> Zinc as chacha20-unrolled-arm.S. This code became active for certain
> cores where NEON was bad and for cores with no NEON. The condition for
> it was:
>
>         switch (read_cpuid_part()) {
>         case ARM_CPU_PART_CORTEX_A7:
>         case ARM_CPU_PART_CORTEX_A5:
>                 /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
>                  * implementation but do incredibly with the scalar one and use
>                  * less power.
>                  */
>                 break;
>         default:
>                 chacha20_use_neon = elf_hwcap & HWCAP_NEON;
>         }
>
> ...

How is it relevant whether the boot CPU is A5 or A7? These are bL
little cores that only implement NEON for feature parity with their bL
big counterparts, but CPU intensive tasks are scheduled on big cores,
where NEON performance is much better than scalar.

If we need a policy for this in the kernel, I'd prefer it to be one at
the arch/arm level where we disable kernel mode NEON entirely, either
via a command line option, or via a policy based on the types of
all CPUs.

>         for (;;) {
>                 if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
>                     len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context)) {
>                         const size_t bytes = min_t(size_t, len, PAGE_SIZE);
>
>                         chacha20_neon(dst, src, bytes, ctx->key, ctx->counter);
>                         ctx->counter[0] += (bytes + 63) / 64;
>                         len -= bytes;
>                         if (!len)
>                                 break;
>                         dst += bytes;
>                         src += bytes;
>                         simd_relax(simd_context);
>                 } else {
>                         chacha20_arm(dst, src, len, ctx->key, ctx->counter);
>                         ctx->counter[0] += (len + 63) / 64;
>                         break;
>                 }
>         }
>
> It's another instance in which the generic code was totally optimized
> out of Zinc builds.
>
> Did these changes make it into the existing tree?

I'd like to keep Eric's code, but if it is really that much faster, we
might drop it in arch/arm/lib so it supersedes the builtin code that
/dev/random uses as well.
Jason A. Donenfeld Oct. 4, 2019, 2:28 p.m. UTC | #3
On Fri, Oct 4, 2019 at 4:23 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > Did these changes make it into the existing tree?
>
> I'd like to keep Eric's code, but if it is really that much faster, we
> might drop it in arch/arm/lib so it supersedes the builtin code that
> /dev/random uses as well.

That was the idea with Zinc. For things like ARM and MIPS, the
optimized scalar code is really quite fast, and is worth using all the
time and not compiling the generic code at all.
Jason A. Donenfeld Oct. 4, 2019, 2:29 p.m. UTC | #4
On Fri, Oct 4, 2019 at 4:23 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> How is it relevant whether the boot CPU is A5 or A7? These are bL
> little cores that only implement NEON for feature parity with their bL
> big counterparts, but CPU intensive tasks are scheduled on big cores,
> where NEON performance is much better than scalar.

Yeah, big.LITTLE might confuse things indeed. Though the performance
difference between the NEON code and the scalar code is not that huge,
and I suspect that big.LITTLE machines might benefit from
unconditionally using the scalar code, given that sometimes they might
wind up doing things on the little cores.

Eric - what did you guys wind up doing on Android with the fast scalar
implementation?
Arnd Bergmann Oct. 4, 2019, 3:24 p.m. UTC | #5
On Fri, Oct 4, 2019 at 4:23 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> How is it relevant whether the boot CPU is A5 or A7? These are bL
> little cores that only implement NEON for feature parity with their bL
> big counterparts, but CPU intensive tasks are scheduled on big cores,
> where NEON performance is much better than scalar.
>
> If we need a policy for this in the kernel, I'd prefer it to be one at
> the arch/arm level where we disable kernel mode NEON entirely, either
> via a command line option, or via a policy based on the types of
> all CPUs.

I don't think there was ever a b.L system with an A5, and most of the
A7+A15 systems did not age well, being high-end phone chips in
2014 that quickly got replaced with A53 parts and that no longer
get kernel upgrades.

The only chips I can think of that one might still care about here
are Exynos 542x (Chromebook 2 EOL 2019, Odroid XU4) and
Allwinner A80 (Cubieboard 4).

Just checking for Cortex-A7 being the boot CPU is probably
sufficient; that takes care of the common case of all the
A7-only embedded chips that people definitely are going to care
about for a long time.

      Arnd
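
A minimal sketch of the boot-CPU check Arnd describes, reusing the
Zinc-style chacha20_use_neon flag from the snippet earlier in the thread
(the init helper name is hypothetical, not code from this series):

        /* Assumes <asm/cputype.h> for read_cpuid_part()/ARM_CPU_PART_CORTEX_A7
         * and <asm/hwcap.h> for elf_hwcap/HWCAP_NEON.
         */
        static bool chacha20_use_neon __ro_after_init;

        static void __init chacha20_select_impl(void)
        {
                /* Prefer the scalar code on Cortex-A7-only embedded parts. */
                if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A7)
                        chacha20_use_neon = false;
                else
                        chacha20_use_neon = elf_hwcap & HWCAP_NEON;
        }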
Ard Biesheuvel Oct. 4, 2019, 3:35 p.m. UTC | #6
On Fri, 4 Oct 2019 at 17:24, Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Fri, Oct 4, 2019 at 4:23 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> >
> > How is it relevant whether the boot CPU is A5 or A7? These are bL
> > little cores that only implement NEON for feature parity with their bL
> > big counterparts, but CPU intensive tasks are scheduled on big cores,
> > where NEON performance is much better than scalar.
> >
> > If we need a policy for this in the kernel, I'd prefer it to be one at
> > the arch/arm level where we disable kernel mode NEON entirely, either
> > via a command line option, or via a policy based on the types of
> > all CPUs.
>
> I don't think there was ever a b.L system with an A5, and most of the
> A7+A15 systems did not age well, being high-end phone chips in
> 2014 that quickly got replaced with A53 parts and that no longer
> get kernel upgrades.
>
> The only chips I can think of that one might still care about here
> are Exynos 542x (Chromebook 2 EOL 2019, Odroid XU4) and
> Allwinner A80 (Cubieboard 4).
>
> Just checking for Cortex-A7 being the boot CPU is probably
> sufficient; that takes care of the common case of all the
> A7-only embedded chips that people definitely are going to care
> about for a long time.

But do you agree that disabling kernel mode NEON altogether for these
systems is probably more sensible than testing for CPU part IDs in an
arbitrary crypto driver?
Jason A. Donenfeld Oct. 4, 2019, 3:38 p.m. UTC | #7
On Fri, Oct 4, 2019 at 5:36 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > Just checking for Cortex-A7 being the boot CPU is probably
> > sufficient; that takes care of the common case of all the
> > A7-only embedded chips that people definitely are going to care
> > about for a long time.
>
> But do you agree that disabling kernel mode NEON altogether for these
> systems is probably more sensible than testing for CPU part IDs in an
> arbitrary crypto driver?

No. That NEON code is _still_ faster than the generic C code. But it
is not as fast as the scalar code. There might be another primitive
that has a fast NEON implementation but does not have a fast scalar
implementation. The choice there would be between fast NEON and slow
generic. In that case, we want fast NEON. Also, different algorithms
lend themselves to different implementation strategies. Leave this up
to the chacha code, as Zinc does it, since this is the place that has
the most information to decide what it should be running.
Eric Biggers Oct. 4, 2019, 3:43 p.m. UTC | #8
On Fri, Oct 04, 2019 at 04:29:57PM +0200, Jason A. Donenfeld wrote:
> On Fri, Oct 4, 2019 at 4:23 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > How is it relevant whether the boot CPU is A5 or A7? These are bL
> > little cores that only implement NEON for feature parity with their bL
> > big counterparts, but CPU intensive tasks are scheduled on big cores,
> > where NEON performance is much better than scalar.
>
> Yeah, big.LITTLE might confuse things indeed. Though the performance
> difference between the NEON code and the scalar code is not that huge,
> and I suspect that big.LITTLE machines might benefit from
> unconditionally using the scalar code, given that sometimes they might
> wind up doing things on the little cores.
>
> Eric - what did you guys wind up doing on Android with the fast scalar
> implementation?

We're still just using the NEON implementation from the upstream kernel instead.

- Eric

Patch

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index b24df84a1d7a..70e4d5fe5bdb 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -130,6 +130,7 @@  config CRYPTO_CHACHA20_NEON
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
 
 config CRYPTO_NHPOLY1305_NEON
 	tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
index 26576772f18b..eee0f6e4f5d2 100644
--- a/arch/arm/crypto/chacha-neon-glue.c
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -36,6 +36,8 @@  asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
 				       int nrounds);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 
+static bool have_neon __ro_after_init;
+
 static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
 			  unsigned int bytes, int nrounds)
 {
@@ -62,6 +64,36 @@  static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
+void hchacha_block(const u32 *state, u32 *stream, int nrounds)
+{
+	if (!crypto_simd_usable()) {
+		hchacha_block_generic(state, stream, nrounds);
+	} else {
+		kernel_neon_begin();
+		hchacha_block_neon(state, stream, nrounds);
+		kernel_neon_end();
+	}
+}
+EXPORT_SYMBOL(hchacha_block);
+
+void chacha_init(u32 *state, const u32 *key, const u8 *iv)
+{
+	chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init);
+
+void chacha_crypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+		  int nrounds)
+{
+	if (!have_neon || bytes <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	kernel_neon_begin();
+	chacha_doneon(state, dst, src, bytes, nrounds);
+	kernel_neon_end();
+}
+EXPORT_SYMBOL(chacha_crypt);
+
 static int chacha_neon_stream_xor(struct skcipher_request *req,
 				  const struct chacha_ctx *ctx, const u8 *iv)
 {
@@ -177,15 +209,17 @@  static struct skcipher_alg algs[] = {
 
 static int __init chacha_simd_mod_init(void)
 {
-	if (!(elf_hwcap & HWCAP_NEON))
-		return -ENODEV;
+	have_neon = (elf_hwcap & HWCAP_NEON);
+	if (!have_neon)
+		return 0;
 
 	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 static void __exit chacha_simd_mod_fini(void)
 {
-	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+	if (have_neon)
+		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 module_init(chacha_simd_mod_init);
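
As a usage illustration, here is a minimal sketch of how a caller of the
ChaCha library interface wired up above might encrypt a buffer. The helper
function is hypothetical and not part of the patch; the constants are
assumed to be the ones declared in <crypto/chacha.h>.

        #include <crypto/chacha.h>

        /* Hypothetical sketch: XOR a buffer with a 20-round ChaCha
         * keystream via the library interface exported by this patch.
         * The key is assumed to already be laid out as le32 words.
         */
        static void example_chacha20_crypt(u8 *dst, const u8 *src,
                                           unsigned int len,
                                           const u32 key[CHACHA_KEY_SIZE / 4],
                                           const u8 iv[CHACHA_IV_SIZE])
        {
                u32 state[CHACHA_BLOCK_SIZE / sizeof(u32)];

                chacha_init(state, key, iv);
                chacha_crypt(state, dst, src, len, 20);
        }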