@@ -2578,8 +2578,95 @@ SYM_FUNC_START(aesni_cbc_dec)
SYM_FUNC_END(aesni_cbc_dec)
#ifdef __x86_64__
+/*
+ * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_enc)
+ FRAME_BEGIN
+ mov 480(KEYP), KLEN
+ lea .Lcts_permute_table(%rip), T1
+ sub $16, LEN
+ mov T1, T2
+ add $32, T2
+ add LEN, T1
+ sub LEN, T2
+ movups (T1), %xmm4
+ movups (T2), %xmm5
+
+ movups (INP), IN1
+ add LEN, INP
+ movups (INP), IN2
+
+ movups (IVP), STATE
+ pxor IN1, STATE
+ call _aesni_enc1
+
+ pshufb %xmm5, IN2
+ pxor STATE, IN2
+ pshufb %xmm4, STATE
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movaps IN2, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP)
+
+ FRAME_END
+ ret
+SYM_FUNC_END(aesni_cts_cbc_enc)
+
+/*
+ * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_dec)
+ FRAME_BEGIN
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ lea .Lcts_permute_table(%rip), T1
+ sub $16, LEN
+ mov T1, T2
+ add $32, T2
+ add LEN, T1
+ sub LEN, T2
+ movups (T1), %xmm4
+
+ movups (INP), STATE
+ add LEN, INP
+ movups (INP), IN1
+
+ call _aesni_dec1
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ pxor IN1, STATE
+
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movups (T2), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+ call _aesni_dec1
+
+ movups (IVP), IN1
+ pxor IN1, STATE
+ movups STATE, (OUTP)
+
+ FRAME_END
+ ret
+SYM_FUNC_END(aesni_cts_cbc_dec)
+
.pushsection .rodata
.align 16
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
@@ -93,6 +93,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
@@ -454,6 +458,118 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}
+static int cts_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_encrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int cts_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_decrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
#ifdef CONFIG_X86_64
static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
struct skcipher_walk *walk)
@@ -929,6 +1045,23 @@ static struct skcipher_alg aesni_skciphers[] = {
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
#ifdef CONFIG_X86_64
+ }, {
+ .base = {
+ .cra_name = "__cts(cbc(aes))",
+ .cra_driver_name = "__cts-cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cts_cbc_encrypt,
+ .decrypt = cts_cbc_decrypt,
}, {
.base = {
.cra_name = "__ctr(aes)",
Follow the same approach as the arm64 driver for implementing a version of AES-NI in CBC mode that supports ciphertext stealing. Compared to the generic CTS template wrapped around the existing cbc-aes-aesni skcipher, this results in a ~2x speed increase for relatively short inputs (less than 256 bytes), which is relevant given that AES-CBC with ciphertext stealing is used for filename encryption in the fscrypt layer. For larger inputs, the speedup is still significant (~25% on decryption, ~6% on encryption). Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- Full tcrypt benchmark results for cts(cbc-aes-aesni) vs cts-cbc-aes-aesni after the diff (Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz) arch/x86/crypto/aesni-intel_asm.S | 87 +++++++++++++ arch/x86/crypto/aesni-intel_glue.c | 133 ++++++++++++++++++++ 2 files changed, 220 insertions(+)