diff mbox series

[v4,4/8] crypto: x86/aesni-xctr: Add accelerated implementation of XCTR

Message ID 20220412172816.917723-5-nhuck@google.com
State New
Headers show
Series crypto: HCTR2 support | expand

Commit Message

Nathan Huckleberry April 12, 2022, 5:28 p.m. UTC
Add hardware accelerated versions of XCTR for x86-64 CPUs with AESNI
support.  These implementations are modified versions of the CTR
implementations found in aesni-intel_asm.S and aes_ctrby8_avx-x86_64.S.

More information on XCTR can be found in the HCTR2 paper:
Length-preserving encryption with HCTR2:
https://enterprint.iacr.org/2021/1441.pdf

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
---
 arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 233 ++++++++++++++++--------
 arch/x86/crypto/aesni-intel_asm.S       |  70 +++++++
 arch/x86/crypto/aesni-intel_glue.c      |  89 +++++++++
 crypto/Kconfig                          |   2 +-
 4 files changed, 317 insertions(+), 77 deletions(-)

Comments

Eric Biggers April 14, 2022, 7 a.m. UTC | #1
A few initial comments, I'll take a closer look at the .S file soon...

On Tue, Apr 12, 2022 at 05:28:12PM +0000, Nathan Huckleberry wrote:
> Add hardware accelerated versions of XCTR for x86-64 CPUs with AESNI
> support.  These implementations are modified versions of the CTR
> implementations found in aesni-intel_asm.S and aes_ctrby8_avx-x86_64.S.
> 
> More information on XCTR can be found in the HCTR2 paper:
> Length-preserving encryption with HCTR2:
> https://enterprint.iacr.org/2021/1441.pdf

The above link doesn't work.

> +#ifdef __x86_64__
> +/*
> + * void aesni_xctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
> + *		      size_t len, u8 *iv, int byte_ctr)
> + */

This prototype doesn't match the one declared in the .c file.

> +
> +asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8
> +	*out, unsigned int num_bytes, unsigned int byte_ctr);
> +
> +asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, u8 *iv, void *keys, u8
> +	*out, unsigned int num_bytes, unsigned int byte_ctr);
> +
> +asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8
> +	*out, unsigned int num_bytes, unsigned int byte_ctr);

Please don't have line breaks between parameter types and their names.
These should look like:

asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys,
	u8 *out, unsigned int num_bytes, unsigned int byte_ctr);

Also, why aren't the keys const?

> +static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, const u8
> +				   *in, unsigned int len, u8 *iv, unsigned int
> +				   byte_ctr)
> +{
> +	if (ctx->key_length == AES_KEYSIZE_128)
> +		aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
> +					 byte_ctr);
> +	else if (ctx->key_length == AES_KEYSIZE_192)
> +		aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
> +					 byte_ctr);
> +	else
> +		aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
> +					 byte_ctr);
> +}

Same comments above.

> +static int xctr_crypt(struct skcipher_request *req)
> +{
> +	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> +	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
> +	u8 keystream[AES_BLOCK_SIZE];
> +	u8 ctr[AES_BLOCK_SIZE];
> +	struct skcipher_walk walk;
> +	unsigned int nbytes;
> +	unsigned int byte_ctr = 0;
> +	int err;
> +	__le32 ctr32;
> +
> +	err = skcipher_walk_virt(&walk, req, false);
> +
> +	while ((nbytes = walk.nbytes) > 0) {
> +		kernel_fpu_begin();
> +		if (nbytes & AES_BLOCK_MASK)
> +			static_call(aesni_xctr_enc_tfm)(ctx, walk.dst.virt.addr,
> +				walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
> +				walk.iv, byte_ctr);
> +		nbytes &= ~AES_BLOCK_MASK;
> +		byte_ctr += walk.nbytes - nbytes;
> +
> +		if (walk.nbytes == walk.total && nbytes > 0) {
> +			ctr32 = cpu_to_le32(byte_ctr / AES_BLOCK_SIZE + 1);
> +			memcpy(ctr, walk.iv, AES_BLOCK_SIZE);
> +			crypto_xor(ctr, (u8 *)&ctr32, sizeof(ctr32));
> +			aesni_enc(ctx, keystream, ctr);
> +			crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
> +				       nbytes, walk.src.virt.addr + walk.nbytes
> +				       - nbytes, keystream, nbytes);
> +			byte_ctr += nbytes;
> +			nbytes = 0;
> +		}

For the final block case, it would be a bit simpler to do something like this:

	__le32 block[AES_BLOCK_SIZE / sizeof(__le32)]

	
	...
	memcpy(block, walk.iv, AES_BLOCK_SIZE);
	block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE);
	aesni_enc(ctx, (u8 *)block, (u8 *)block);

I.e., have one buffer, use a regular XOR instead of crypto_xor(), and encrypt it
in-place.

> @@ -1162,6 +1249,8 @@ static int __init aesni_init(void)
>  		/* optimize performance of ctr mode encryption transform */
>  		static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
>  		pr_info("AES CTR mode by8 optimization enabled\n");
> +		static_call_update(aesni_xctr_enc_tfm, aesni_xctr_enc_avx_tfm);
> +		pr_info("AES XCTR mode by8 optimization enabled\n");
>  	}

Please don't add the log message above, as it would get printed at every boot-up
on most x86 systems, and it's not important enough for that.  The existing
message "AES CTR mode ..." shouldn't really exist in the first place.

- Eric
Eric Biggers April 18, 2022, 11:44 p.m. UTC | #2
On Tue, Apr 12, 2022 at 05:28:12PM +0000, Nathan Huckleberry wrote:
> diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
> index 43852ba6e19c..9e20d7d3d6da 100644
> --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
> +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
> @@ -53,6 +53,10 @@
>  #define KEY_192		2
>  #define KEY_256		3
>  
> +// XCTR mode only
> +#define counter		%r9
> +#define xiv		%xmm8
> +

It would be helpful if the registers were listed in order, and if the
CTR-specific ones were marked as being specific to CTR.  This would make it easy
to verify that there are no collisions in register allocation.  I.e.:

[...]
#define xdata7		%xmm7
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xkey0		%xmm10
[...]
#define num_bytes	%r8
#define counter		%r9	// XCTR mode only
#define tmp		%r10
[...]


I'm also not a fan of the naming, with "xcounter" being used by CTR only and
"counter" being used by XCTR only...  I see why you did it, though, as the
existing code uses the "x" prefix to mean "this is an xmm register".  It could
at least use a comment that makes this super clear, though:

// Note: the "x" prefix in these aliases means "this is an xmm register".
// No relation to XCTR where the "X" prefix means "XOR counter".
#define xdata0		%xmm0

> +	.if (\xctr == 1)

As \xctr is either 0 or 1, this can be written as simply '.if \xctr'

> +		.set i, 0
> +		.rept (by)
> +			club XDATA, i
> +			movq counter, var_xdata
> +			.set i, (i +1)
> +		.endr
> +	.endif
> +

Since the 3-operand add instruction (vpaddq) is available here, and in fact is
being used already, it isn't necessary to move 'counter' into all (up to 8) of
the var_xdata registers.  Just move it into the last var_xdata register, or into
a temporary register, and use it as a source operand for all the additions.

> -	vpshufb	xbyteswap, xcounter, xdata0
> -
> -	.set i, 1
> -	.rept (by - 1)
> -		club XDATA, i
> -		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
> -		vptest	ddq_low_msk(%rip), var_xdata
> -		jnz 1f
> -		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
> -		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> -		1:
> -		vpshufb	xbyteswap, var_xdata, var_xdata
> -		.set i, (i +1)
> -	.endr
> +	.if (\xctr == 0)
> +		vpshufb	xbyteswap, xcounter, xdata0
> +		.set i, 1
> +		.rept (by - 1)
> +			club XDATA, i
> +			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
> +			vptest	ddq_low_msk(%rip), var_xdata
> +			jnz 1f
> +			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
> +			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> +			1:
> +			vpshufb	xbyteswap, var_xdata, var_xdata
> +			.set i, (i +1)
> +		.endr
> +	.endif
> +	.if (\xctr == 1)
> +		.set i, 0
> +		.rept (by)
> +			club XDATA, i
> +			vpaddq	(ddq_add_1 + 16 * i)(%rip), var_xdata, var_xdata
> +			.set i, (i +1)
> +		.endr
> +		.set i, 0
> +		.rept (by)
> +			club	XDATA, i
> +			vpxor	xiv, var_xdata, var_xdata
> +			.set i, (i +1)
> +		.endr
> +	.endif

This can be written as:

	.if \xctr
	[second part above]
	.else
	[first part above]
	.endif

> -	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
> -	vptest	ddq_low_msk(%rip), xcounter
> -	jnz	1f
> -	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> -	1:
> +	.if (\xctr == 0)
> +		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
> +		vptest	ddq_low_msk(%rip), xcounter
> +		jnz	1f
> +		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> +		1:
> +	.endif
> +	.if (\xctr == 1)
> +		add $by, counter
> +	.endif

Likewise here.

> +.macro do_aes_ctrmain key_len, xctr
>  	cmp	$16, num_bytes
> -	jb	.Ldo_return2\key_len
> +	jb	.Ldo_return2\xctr\key_len
>  
>  	vmovdqa	byteswap_const(%rip), xbyteswap
> -	vmovdqu	(p_iv), xcounter
> -	vpshufb	xbyteswap, xcounter, xcounter
> +	.if (\xctr == 0)
> +		vmovdqu	(p_iv), xcounter
> +		vpshufb	xbyteswap, xcounter, xcounter
> +	.endif
> +	.if (\xctr == 1)
> +		andq	$(~0xf), num_bytes
> +		shr	$4, counter
> +		vmovdqu	(p_iv), xiv
> +	.endif

And likewise here.  Also, the load of byteswap_const can be moved into the
!\xctr block.

- Eric
Nathan Huckleberry April 21, 2022, 9:59 p.m. UTC | #3
On Mon, Apr 18, 2022 at 7:13 PM Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Tue, Apr 12, 2022 at 05:28:12PM +0000, Nathan Huckleberry wrote:
> > diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
> > index 363699dd7220..ce17fe630150 100644
> > --- a/arch/x86/crypto/aesni-intel_asm.S
> > +++ b/arch/x86/crypto/aesni-intel_asm.S
> > @@ -2821,6 +2821,76 @@ SYM_FUNC_END(aesni_ctr_enc)
> >
> >  #endif
> >
> > +#ifdef __x86_64__
> > +/*
> > + * void aesni_xctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
> > + *                 size_t len, u8 *iv, int byte_ctr)
> > + */
> > +SYM_FUNC_START(aesni_xctr_enc)
> > +     FRAME_BEGIN
> > +     cmp $16, LEN
> > +     jb .Lxctr_ret
> > +     shr     $4, %arg6
> > +     movq %arg6, CTR
> > +     mov 480(KEYP), KLEN
> > +     movups (IVP), IV
> > +     cmp $64, LEN
> > +     jb .Lxctr_enc_loop1
> > +.align 4
> > +.Lxctr_enc_loop4:
> > +     movaps IV, STATE1
> > +     vpaddq ONE(%rip), CTR, CTR
> > +     vpxor CTR, STATE1, STATE1
> > +     movups (INP), IN1
> > +     movaps IV, STATE2
> > +     vpaddq ONE(%rip), CTR, CTR
> > +     vpxor CTR, STATE2, STATE2
> > +     movups 0x10(INP), IN2
> > +     movaps IV, STATE3
> > +     vpaddq ONE(%rip), CTR, CTR
> > +     vpxor CTR, STATE3, STATE3
> > +     movups 0x20(INP), IN3
> > +     movaps IV, STATE4
> > +     vpaddq ONE(%rip), CTR, CTR
> > +     vpxor CTR, STATE4, STATE4
> > +     movups 0x30(INP), IN4
> > +     call _aesni_enc4
> > +     pxor IN1, STATE1
> > +     movups STATE1, (OUTP)
> > +     pxor IN2, STATE2
> > +     movups STATE2, 0x10(OUTP)
> > +     pxor IN3, STATE3
> > +     movups STATE3, 0x20(OUTP)
> > +     pxor IN4, STATE4
> > +     movups STATE4, 0x30(OUTP)
> > +     sub $64, LEN
> > +     add $64, INP
> > +     add $64, OUTP
> > +     cmp $64, LEN
> > +     jge .Lxctr_enc_loop4
> > +     cmp $16, LEN
> > +     jb .Lxctr_ret
> > +.align 4
> > +.Lxctr_enc_loop1:
> > +     movaps IV, STATE
> > +     vpaddq ONE(%rip), CTR, CTR
> > +     vpxor CTR, STATE1, STATE1
> > +     movups (INP), IN
> > +     call _aesni_enc1
> > +     pxor IN, STATE
> > +     movups STATE, (OUTP)
> > +     sub $16, LEN
> > +     add $16, INP
> > +     add $16, OUTP
> > +     cmp $16, LEN
> > +     jge .Lxctr_enc_loop1
> > +.Lxctr_ret:
> > +     FRAME_END
> > +     RET
> > +SYM_FUNC_END(aesni_xctr_enc)
> > +
> > +#endif
>
> Sorry, I missed this file.  This is the non-AVX version, right?  That means that
> AVX instructions, i.e. basically anything instruction starting with "v", can't
> be used here.  So the above isn't going to work.  (There might be a way to test
> this with QEMU; maybe --cpu-type=Nehalem without --enable-kvm?)
>
> You could rewrite this without using AVX instructions.  However, polyval-clmulni
> is broken in the same way; it uses AVX instructions without checking whether
> they are available.  But your patchset doesn't aim to provide a non-AVX polyval
> implementation at all.  So even if you got the non-AVX XCTR working, it wouldn't
> be paired with an accelerated polyval.
>
> So I think you should just not provide non-AVX versions for now.  That would
> mean:
>
>         1.) Drop the change to aesni-intel_asm.S
>         2.) Don't register the AES XCTR algorithm unless AVX is available
>             (in addition to AES-NI)

Is there a preferred way to conditionally register xctr? It looks like
aesni-intel_glue.c registers a default implementation for all the
algorithms in the array, then better versions are enabled depending on
cpu features. Should I remove xctr from the list of other algorithms
and register it separately?

>         3.) Don't register polyval-clmulni unless AVX is available
>             (in addition to CLMUL-NI)
>
> - Eric
Eric Biggers April 21, 2022, 10:29 p.m. UTC | #4
On Thu, Apr 21, 2022 at 04:59:31PM -0500, Nathan Huckleberry wrote:
> On Mon, Apr 18, 2022 at 7:13 PM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Tue, Apr 12, 2022 at 05:28:12PM +0000, Nathan Huckleberry wrote:
> > > diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
> > > index 363699dd7220..ce17fe630150 100644
> > > --- a/arch/x86/crypto/aesni-intel_asm.S
> > > +++ b/arch/x86/crypto/aesni-intel_asm.S
> > > @@ -2821,6 +2821,76 @@ SYM_FUNC_END(aesni_ctr_enc)
> > >
> > >  #endif
> > >
> > > +#ifdef __x86_64__
> > > +/*
> > > + * void aesni_xctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
> > > + *                 size_t len, u8 *iv, int byte_ctr)
> > > + */
> > > +SYM_FUNC_START(aesni_xctr_enc)
> > > +     FRAME_BEGIN
> > > +     cmp $16, LEN
> > > +     jb .Lxctr_ret
> > > +     shr     $4, %arg6
> > > +     movq %arg6, CTR
> > > +     mov 480(KEYP), KLEN
> > > +     movups (IVP), IV
> > > +     cmp $64, LEN
> > > +     jb .Lxctr_enc_loop1
> > > +.align 4
> > > +.Lxctr_enc_loop4:
> > > +     movaps IV, STATE1
> > > +     vpaddq ONE(%rip), CTR, CTR
> > > +     vpxor CTR, STATE1, STATE1
> > > +     movups (INP), IN1
> > > +     movaps IV, STATE2
> > > +     vpaddq ONE(%rip), CTR, CTR
> > > +     vpxor CTR, STATE2, STATE2
> > > +     movups 0x10(INP), IN2
> > > +     movaps IV, STATE3
> > > +     vpaddq ONE(%rip), CTR, CTR
> > > +     vpxor CTR, STATE3, STATE3
> > > +     movups 0x20(INP), IN3
> > > +     movaps IV, STATE4
> > > +     vpaddq ONE(%rip), CTR, CTR
> > > +     vpxor CTR, STATE4, STATE4
> > > +     movups 0x30(INP), IN4
> > > +     call _aesni_enc4
> > > +     pxor IN1, STATE1
> > > +     movups STATE1, (OUTP)
> > > +     pxor IN2, STATE2
> > > +     movups STATE2, 0x10(OUTP)
> > > +     pxor IN3, STATE3
> > > +     movups STATE3, 0x20(OUTP)
> > > +     pxor IN4, STATE4
> > > +     movups STATE4, 0x30(OUTP)
> > > +     sub $64, LEN
> > > +     add $64, INP
> > > +     add $64, OUTP
> > > +     cmp $64, LEN
> > > +     jge .Lxctr_enc_loop4
> > > +     cmp $16, LEN
> > > +     jb .Lxctr_ret
> > > +.align 4
> > > +.Lxctr_enc_loop1:
> > > +     movaps IV, STATE
> > > +     vpaddq ONE(%rip), CTR, CTR
> > > +     vpxor CTR, STATE1, STATE1
> > > +     movups (INP), IN
> > > +     call _aesni_enc1
> > > +     pxor IN, STATE
> > > +     movups STATE, (OUTP)
> > > +     sub $16, LEN
> > > +     add $16, INP
> > > +     add $16, OUTP
> > > +     cmp $16, LEN
> > > +     jge .Lxctr_enc_loop1
> > > +.Lxctr_ret:
> > > +     FRAME_END
> > > +     RET
> > > +SYM_FUNC_END(aesni_xctr_enc)
> > > +
> > > +#endif
> >
> > Sorry, I missed this file.  This is the non-AVX version, right?  That means that
> > AVX instructions, i.e. basically anything instruction starting with "v", can't
> > be used here.  So the above isn't going to work.  (There might be a way to test
> > this with QEMU; maybe --cpu-type=Nehalem without --enable-kvm?)
> >
> > You could rewrite this without using AVX instructions.  However, polyval-clmulni
> > is broken in the same way; it uses AVX instructions without checking whether
> > they are available.  But your patchset doesn't aim to provide a non-AVX polyval
> > implementation at all.  So even if you got the non-AVX XCTR working, it wouldn't
> > be paired with an accelerated polyval.
> >
> > So I think you should just not provide non-AVX versions for now.  That would
> > mean:
> >
> >         1.) Drop the change to aesni-intel_asm.S
> >         2.) Don't register the AES XCTR algorithm unless AVX is available
> >             (in addition to AES-NI)
> 
> Is there a preferred way to conditionally register xctr? It looks like
> aesni-intel_glue.c registers a default implementation for all the
> algorithms in the array, then better versions are enabled depending on
> cpu features. Should I remove xctr from the list of other algorithms
> and register it separately?
> 

Yes, it will need to be removed from the aesni_skciphers array.  I don't see any
other algorithms in that file that are conditional on AES-NI && AVX, so it will
have to go by itself.

- Eric
diff mbox series

Patch

diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index 43852ba6e19c..9e20d7d3d6da 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -53,6 +53,10 @@ 
 #define KEY_192		2
 #define KEY_256		3
 
+// XCTR mode only
+#define counter		%r9
+#define xiv		%xmm8
+
 .section .rodata
 .align 16
 
@@ -102,38 +106,67 @@  ddq_add_8:
  * do_aes num_in_par load_keys key_len
  * This increments p_in, but not p_out
  */
-.macro do_aes b, k, key_len
+.macro do_aes b, k, key_len, xctr
 	.set by, \b
 	.set load_keys, \k
 	.set klen, \key_len
 
+	.if (\xctr == 1)
+		.set i, 0
+		.rept (by)
+			club XDATA, i
+			movq counter, var_xdata
+			.set i, (i +1)
+		.endr
+	.endif
+
 	.if (load_keys)
 		vmovdqa	0*16(p_keys), xkey0
 	.endif
 
-	vpshufb	xbyteswap, xcounter, xdata0
-
-	.set i, 1
-	.rept (by - 1)
-		club XDATA, i
-		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
-		vptest	ddq_low_msk(%rip), var_xdata
-		jnz 1f
-		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
-		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
-		1:
-		vpshufb	xbyteswap, var_xdata, var_xdata
-		.set i, (i +1)
-	.endr
+	.if (\xctr == 0)
+		vpshufb	xbyteswap, xcounter, xdata0
+		.set i, 1
+		.rept (by - 1)
+			club XDATA, i
+			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
+			vptest	ddq_low_msk(%rip), var_xdata
+			jnz 1f
+			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
+			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
+			1:
+			vpshufb	xbyteswap, var_xdata, var_xdata
+			.set i, (i +1)
+		.endr
+	.endif
+	.if (\xctr == 1)
+		.set i, 0
+		.rept (by)
+			club XDATA, i
+			vpaddq	(ddq_add_1 + 16 * i)(%rip), var_xdata, var_xdata
+			.set i, (i +1)
+		.endr
+		.set i, 0
+		.rept (by)
+			club	XDATA, i
+			vpxor	xiv, var_xdata, var_xdata
+			.set i, (i +1)
+		.endr
+	.endif
 
 	vmovdqa	1*16(p_keys), xkeyA
 
 	vpxor	xkey0, xdata0, xdata0
-	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
-	vptest	ddq_low_msk(%rip), xcounter
-	jnz	1f
-	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
-	1:
+	.if (\xctr == 0)
+		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
+		vptest	ddq_low_msk(%rip), xcounter
+		jnz	1f
+		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
+		1:
+	.endif
+	.if (\xctr == 1)
+		add $by, counter
+	.endif
 
 	.set i, 1
 	.rept (by - 1)
@@ -371,94 +404,101 @@  ddq_add_8:
 	.endr
 .endm
 
-.macro do_aes_load val, key_len
-	do_aes \val, 1, \key_len
+.macro do_aes_load val, key_len, xctr
+	do_aes \val, 1, \key_len, \xctr
 .endm
 
-.macro do_aes_noload val, key_len
-	do_aes \val, 0, \key_len
+.macro do_aes_noload val, key_len, xctr
+	do_aes \val, 0, \key_len, \xctr
 .endm
 
 /* main body of aes ctr load */
 
-.macro do_aes_ctrmain key_len
+.macro do_aes_ctrmain key_len, xctr
 	cmp	$16, num_bytes
-	jb	.Ldo_return2\key_len
+	jb	.Ldo_return2\xctr\key_len
 
 	vmovdqa	byteswap_const(%rip), xbyteswap
-	vmovdqu	(p_iv), xcounter
-	vpshufb	xbyteswap, xcounter, xcounter
+	.if (\xctr == 0)
+		vmovdqu	(p_iv), xcounter
+		vpshufb	xbyteswap, xcounter, xcounter
+	.endif
+	.if (\xctr == 1)
+		andq	$(~0xf), num_bytes
+		shr	$4, counter
+		vmovdqu	(p_iv), xiv
+	.endif
 
 	mov	num_bytes, tmp
 	and	$(7*16), tmp
-	jz	.Lmult_of_8_blks\key_len
+	jz	.Lmult_of_8_blks\xctr\key_len
 
 	/* 1 <= tmp <= 7 */
 	cmp	$(4*16), tmp
-	jg	.Lgt4\key_len
-	je	.Leq4\key_len
+	jg	.Lgt4\xctr\key_len
+	je	.Leq4\xctr\key_len
 
-.Llt4\key_len:
+.Llt4\xctr\key_len:
 	cmp	$(2*16), tmp
-	jg	.Leq3\key_len
-	je	.Leq2\key_len
+	jg	.Leq3\xctr\key_len
+	je	.Leq2\xctr\key_len
 
-.Leq1\key_len:
-	do_aes_load	1, \key_len
+.Leq1\xctr\key_len:
+	do_aes_load	1, \key_len, \xctr
 	add	$(1*16), p_out
 	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\key_len
-	jmp	.Lmain_loop2\key_len
+	jz	.Ldo_return2\xctr\key_len
+	jmp	.Lmain_loop2\xctr\key_len
 
-.Leq2\key_len:
-	do_aes_load	2, \key_len
+.Leq2\xctr\key_len:
+	do_aes_load	2, \key_len, \xctr
 	add	$(2*16), p_out
 	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\key_len
-	jmp	.Lmain_loop2\key_len
+	jz	.Ldo_return2\xctr\key_len
+	jmp	.Lmain_loop2\xctr\key_len
 
 
-.Leq3\key_len:
-	do_aes_load	3, \key_len
+.Leq3\xctr\key_len:
+	do_aes_load	3, \key_len, \xctr
 	add	$(3*16), p_out
 	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\key_len
-	jmp	.Lmain_loop2\key_len
+	jz	.Ldo_return2\xctr\key_len
+	jmp	.Lmain_loop2\xctr\key_len
 
-.Leq4\key_len:
-	do_aes_load	4, \key_len
+.Leq4\xctr\key_len:
+	do_aes_load	4, \key_len, \xctr
 	add	$(4*16), p_out
 	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\key_len
-	jmp	.Lmain_loop2\key_len
+	jz	.Ldo_return2\xctr\key_len
+	jmp	.Lmain_loop2\xctr\key_len
 
-.Lgt4\key_len:
+.Lgt4\xctr\key_len:
 	cmp	$(6*16), tmp
-	jg	.Leq7\key_len
-	je	.Leq6\key_len
+	jg	.Leq7\xctr\key_len
+	je	.Leq6\xctr\key_len
 
-.Leq5\key_len:
-	do_aes_load	5, \key_len
+.Leq5\xctr\key_len:
+	do_aes_load	5, \key_len, \xctr
 	add	$(5*16), p_out
 	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\key_len
-	jmp	.Lmain_loop2\key_len
+	jz	.Ldo_return2\xctr\key_len
+	jmp	.Lmain_loop2\xctr\key_len
 
-.Leq6\key_len:
-	do_aes_load	6, \key_len
+.Leq6\xctr\key_len:
+	do_aes_load	6, \key_len, \xctr
 	add	$(6*16), p_out
 	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\key_len
-	jmp	.Lmain_loop2\key_len
+	jz	.Ldo_return2\xctr\key_len
+	jmp	.Lmain_loop2\xctr\key_len
 
-.Leq7\key_len:
-	do_aes_load	7, \key_len
+.Leq7\xctr\key_len:
+	do_aes_load	7, \key_len, \xctr
 	add	$(7*16), p_out
 	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\key_len
-	jmp	.Lmain_loop2\key_len
+	jz	.Ldo_return2\xctr\key_len
+	jmp	.Lmain_loop2\xctr\key_len
 
-.Lmult_of_8_blks\key_len:
+.Lmult_of_8_blks\xctr\key_len:
 	.if (\key_len != KEY_128)
 		vmovdqa	0*16(p_keys), xkey0
 		vmovdqa	4*16(p_keys), xkey4
@@ -471,17 +511,19 @@  ddq_add_8:
 		vmovdqa	9*16(p_keys), xkey12
 	.endif
 .align 16
-.Lmain_loop2\key_len:
+.Lmain_loop2\xctr\key_len:
 	/* num_bytes is a multiple of 8 and >0 */
-	do_aes_noload	8, \key_len
+	do_aes_noload	8, \key_len, \xctr
 	add	$(8*16), p_out
 	sub	$(8*16), num_bytes
-	jne	.Lmain_loop2\key_len
+	jne	.Lmain_loop2\xctr\key_len
 
-.Ldo_return2\key_len:
-	/* return updated IV */
-	vpshufb	xbyteswap, xcounter, xcounter
-	vmovdqu	xcounter, (p_iv)
+.Ldo_return2\xctr\key_len:
+	.if (\xctr == 0)
+		/* return updated IV */
+		vpshufb	xbyteswap, xcounter, xcounter
+		vmovdqu	xcounter, (p_iv)
+	.endif
 	RET
 .endm
 
@@ -494,7 +536,7 @@  ddq_add_8:
  */
 SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
 	/* call the aes main loop */
-	do_aes_ctrmain KEY_128
+	do_aes_ctrmain KEY_128 0
 
 SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
 
@@ -507,7 +549,7 @@  SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
  */
 SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
 	/* call the aes main loop */
-	do_aes_ctrmain KEY_192
+	do_aes_ctrmain KEY_192 0
 
 SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
 
@@ -520,6 +562,45 @@  SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
  */
 SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
 	/* call the aes main loop */
-	do_aes_ctrmain KEY_256
+	do_aes_ctrmain KEY_256 0
 
 SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
+
+/*
+ * routine to do AES128 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const aes_ctx *keys, u8
+ * 			    *out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
+	/* call the aes main loop */
+	do_aes_ctrmain KEY_128 1
+
+SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const aes_ctx *keys, u8
+ * 			    *out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
+	/* call the aes main loop */
+	do_aes_ctrmain KEY_192 1
+
+SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const aes_ctx *keys, u8
+ * 			    *out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
+	/* call the aes main loop */
+	do_aes_ctrmain KEY_256 1
+
+SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 363699dd7220..ce17fe630150 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2821,6 +2821,76 @@  SYM_FUNC_END(aesni_ctr_enc)
 
 #endif
 
+#ifdef __x86_64__
+/*
+ * void aesni_xctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len, u8 *iv, int byte_ctr)
+ */
+SYM_FUNC_START(aesni_xctr_enc)
+	FRAME_BEGIN
+	cmp $16, LEN
+	jb .Lxctr_ret
+	shr	$4, %arg6
+	movq %arg6, CTR
+	mov 480(KEYP), KLEN
+	movups (IVP), IV
+	cmp $64, LEN
+	jb .Lxctr_enc_loop1
+.align 4
+.Lxctr_enc_loop4:
+	movaps IV, STATE1
+	vpaddq ONE(%rip), CTR, CTR
+	vpxor CTR, STATE1, STATE1
+	movups (INP), IN1
+	movaps IV, STATE2
+	vpaddq ONE(%rip), CTR, CTR
+	vpxor CTR, STATE2, STATE2
+	movups 0x10(INP), IN2
+	movaps IV, STATE3
+	vpaddq ONE(%rip), CTR, CTR
+	vpxor CTR, STATE3, STATE3
+	movups 0x20(INP), IN3
+	movaps IV, STATE4
+	vpaddq ONE(%rip), CTR, CTR
+	vpxor CTR, STATE4, STATE4
+	movups 0x30(INP), IN4
+	call _aesni_enc4
+	pxor IN1, STATE1
+	movups STATE1, (OUTP)
+	pxor IN2, STATE2
+	movups STATE2, 0x10(OUTP)
+	pxor IN3, STATE3
+	movups STATE3, 0x20(OUTP)
+	pxor IN4, STATE4
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lxctr_enc_loop4
+	cmp $16, LEN
+	jb .Lxctr_ret
+.align 4
+.Lxctr_enc_loop1:
+	movaps IV, STATE
+	vpaddq ONE(%rip), CTR, CTR
+	vpxor CTR, STATE1, STATE1
+	movups (INP), IN
+	call _aesni_enc1
+	pxor IN, STATE
+	movups STATE, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lxctr_enc_loop1
+.Lxctr_ret:
+	FRAME_END
+	RET
+SYM_FUNC_END(aesni_xctr_enc)
+
+#endif
+
 .section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
 .align 16
 .Lgf128mul_x_ble_mask:
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 41901ba9d3a2..74021bd524b6 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -112,6 +112,11 @@  asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);
 
+asmlinkage void aesni_xctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			       const u8 *in, unsigned int len, u8 *iv,
+			       unsigned int byte_ctr);
+DEFINE_STATIC_CALL(aesni_xctr_enc_tfm, aesni_xctr_enc);
+
 /* Scatter / Gather routines, with args similar to above */
 asmlinkage void aesni_gcm_init(void *ctx,
 			       struct gcm_context_data *gdata,
@@ -135,6 +140,16 @@  asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
 		void *keys, u8 *out, unsigned int num_bytes);
 asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
 		void *keys, u8 *out, unsigned int num_bytes);
+
+asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8
+	*out, unsigned int num_bytes, unsigned int byte_ctr);
+
+asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, u8 *iv, void *keys, u8
+	*out, unsigned int num_bytes, unsigned int byte_ctr);
+
+asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8
+	*out, unsigned int num_bytes, unsigned int byte_ctr);
+
 /*
  * asmlinkage void aesni_gcm_init_avx_gen2()
  * gcm_data *my_ctx_data, context data
@@ -527,6 +542,61 @@  static int ctr_crypt(struct skcipher_request *req)
 	return err;
 }
 
+static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, const u8
+				   *in, unsigned int len, u8 *iv, unsigned int
+				   byte_ctr)
+{
+	if (ctx->key_length == AES_KEYSIZE_128)
+		aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
+					 byte_ctr);
+	else if (ctx->key_length == AES_KEYSIZE_192)
+		aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
+					 byte_ctr);
+	else
+		aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
+					 byte_ctr);
+}
+
+static int xctr_crypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+	u8 keystream[AES_BLOCK_SIZE];
+	u8 ctr[AES_BLOCK_SIZE];
+	struct skcipher_walk walk;
+	unsigned int nbytes;
+	unsigned int byte_ctr = 0;
+	int err;
+	__le32 ctr32;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	while ((nbytes = walk.nbytes) > 0) {
+		kernel_fpu_begin();
+		if (nbytes & AES_BLOCK_MASK)
+			static_call(aesni_xctr_enc_tfm)(ctx, walk.dst.virt.addr,
+				walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
+				walk.iv, byte_ctr);
+		nbytes &= ~AES_BLOCK_MASK;
+		byte_ctr += walk.nbytes - nbytes;
+
+		if (walk.nbytes == walk.total && nbytes > 0) {
+			ctr32 = cpu_to_le32(byte_ctr / AES_BLOCK_SIZE + 1);
+			memcpy(ctr, walk.iv, AES_BLOCK_SIZE);
+			crypto_xor(ctr, (u8 *)&ctr32, sizeof(ctr32));
+			aesni_enc(ctx, keystream, ctr);
+			crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
+				       nbytes, walk.src.virt.addr + walk.nbytes
+				       - nbytes, keystream, nbytes);
+			byte_ctr += nbytes;
+			nbytes = 0;
+		}
+		kernel_fpu_end();
+		err = skcipher_walk_done(&walk, nbytes);
+	}
+	return err;
+}
+
 static int
 rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
 {
@@ -1026,6 +1096,23 @@  static struct skcipher_alg aesni_skciphers[] = {
 		.setkey		= aesni_skcipher_setkey,
 		.encrypt	= ctr_crypt,
 		.decrypt	= ctr_crypt,
+	}, {
+		.base = {
+			.cra_name		= "__xctr(aes)",
+			.cra_driver_name	= "__xctr-aes-aesni",
+			.cra_priority		= 400,
+			.cra_flags		= CRYPTO_ALG_INTERNAL,
+			.cra_blocksize		= 1,
+			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
+			.cra_module		= THIS_MODULE,
+		},
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.chunksize	= AES_BLOCK_SIZE,
+		.setkey		= aesni_skcipher_setkey,
+		.encrypt	= xctr_crypt,
+		.decrypt	= xctr_crypt,
 #endif
 	}, {
 		.base = {
@@ -1162,6 +1249,8 @@  static int __init aesni_init(void)
 		/* optimize performance of ctr mode encryption transform */
 		static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
 		pr_info("AES CTR mode by8 optimization enabled\n");
+		static_call_update(aesni_xctr_enc_tfm, aesni_xctr_enc_avx_tfm);
+		pr_info("AES XCTR mode by8 optimization enabled\n");
 	}
 #endif
 
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 0dedba74db4a..aa06af0e0ebe 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1161,7 +1161,7 @@  config CRYPTO_AES_NI_INTEL
 	  In addition to AES cipher algorithm support, the acceleration
 	  for some popular block cipher mode is supported too, including
 	  ECB, CBC, LRW, XTS. The 64 bit version has additional
-	  acceleration for CTR.
+	  acceleration for CTR and XCTR.
 
 config CRYPTO_AES_SPARC64
 	tristate "AES cipher algorithms (SPARC64)"