@@ -220,6 +220,20 @@
rorq $32, RAB2; \
outunpack3(mov, RIO, 2, RAB, 2);
+#define outunpack_cbc_dec3() \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ xorq (RT1), RCD1; \
+ rorq $32, RCD2; \
+ xorq 16(RT1), RCD2; \
+ outunpack3(mov, RIO, 0, RCD, 0); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ xorq 8(RT1), RAB1; \
+ rorq $32, RAB2; \
+ xorq 24(RT1), RAB2; \
+ outunpack3(mov, RIO, 2, RAB, 2);
+
SYM_FUNC_START(twofish_enc_blk_3way)
/* input:
* %rdi: ctx, CTX
@@ -255,17 +269,20 @@ SYM_FUNC_START(twofish_enc_blk_3way)
RET;
SYM_FUNC_END(twofish_enc_blk_3way)
-SYM_FUNC_START(twofish_dec_blk_3way)
+SYM_FUNC_START(__twofish_dec_blk_3way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src, RIO
+ * %rcx: cbc (bool)
*/
pushq %r13;
pushq %r12;
pushq %rbx;
pushq %rsi; /* dst */
+ pushq %rdx; /* src */
+ pushq %rcx; /* cbc */
inpack_dec3();
@@ -280,12 +297,24 @@ SYM_FUNC_START(twofish_dec_blk_3way)
decrypt_cycle3(RAB, CD, 0);
pop_cd();
+ popq RT0; /* cbc */
+ popq RT1; /* src */
popq RIO; /* dst */
+ testq RT0, RT0;
+ jnz .L_dec_cbc;
+
outunpack_dec3();
popq %rbx;
popq %r12;
popq %r13;
RET;
-SYM_FUNC_END(twofish_dec_blk_3way)
+
+.L_dec_cbc:
+ outunpack_cbc_dec3();
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ RET;
+SYM_FUNC_END(__twofish_dec_blk_3way)
@@ -12,9 +12,19 @@ asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src);
/* 3-way parallel cipher functions */
asmlinkage void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void __twofish_dec_blk_3way(const void *ctx, u8 *dst,
+ const u8 *src, bool cbc);
-/* helpers from twofish_x86_64-3way module */
-extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src);
+/* helpers for use of __twofish_dec_blk_3way() */
+static inline void twofish_dec_blk_3way(const void *ctx, u8 *dst,
+ const u8 *src)
+{
+ return __twofish_dec_blk_3way(ctx, dst, src, false);
+}
+static inline void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst,
+ const u8 *src)
+{
+ return __twofish_dec_blk_3way(ctx, dst, src, true);
+}
#endif /* ASM_X86_TWOFISH_H */
@@ -16,7 +16,7 @@
#include "ecb_cbc_helpers.h"
EXPORT_SYMBOL_GPL(twofish_enc_blk_3way);
-EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
+EXPORT_SYMBOL_GPL(__twofish_dec_blk_3way);
static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -24,19 +24,6 @@ static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
return twofish_setkey(&tfm->base, key, keylen);
}
-void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src)
-{
- u8 buf[2][TF_BLOCK_SIZE];
- const u8 *s = src;
-
- if (dst == src)
- s = memcpy(buf, src, sizeof(buf));
- twofish_dec_blk_3way(ctx, dst, src);
- crypto_xor(dst + TF_BLOCK_SIZE, s, sizeof(buf));
-
-}
-EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
-
static int ecb_encrypt(struct skcipher_request *req)
{
ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
Optimize twofish-3way cbc decryption by keeping intermediate results in registers until computations are finished, rather than storing in assembly, then immediately reloading them in glue code. Additionally, keeping all operations in assembly can avoid a memcpy() call when the decryption is being done in place. cbc decoding speedups: (tcrypt mode=202 on a znver1) 64B: +7.7%, 128B: +6.3%, 256B: +6.8% Signed-off-by: Peter Lafreniere <peter@n8pjl.ca> --- arch/x86/crypto/twofish-x86_64-asm_64-3way.S | 33 ++++++++++++++++++-- arch/x86/crypto/twofish.h | 16 ++++++++-- arch/x86/crypto/twofish_glue_3way.c | 15 +-------- 3 files changed, 45 insertions(+), 19 deletions(-)