@@ -212,23 +212,36 @@
ushr XL.2d, XL.2d, #1
.endm
- .macro __pmull_ghash, pn
- ld1 {SHASH.2d}, [x3]
- ld1 {XL.2d}, [x1]
+ .macro __pmull_ghash, pn, yield
+ stp x29, x30, [sp, #-64]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ stp x21, x22, [sp, #32]
+ str x23, [sp, #48]
+
+ mov x19, x0 // blocks
+ mov x20, x1 // dg
+ mov x21, x2 // src
+ mov x22, x3 // key
+ mov x23, x4 // head
+
+0: ld1 {SHASH.2d}, [x22] // reload hash key (restart point after a yield)
+ ld1 {XL.2d}, [x20] // reload digest
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
eor SHASH2.16b, SHASH2.16b, SHASH.16b
__pmull_pre_\pn
/* do the head block first, if supplied */
- cbz x4, 0f
- ld1 {T1.2d}, [x4]
- b 1f
+ cbz x23, 1f
+ ld1 {T1.2d}, [x23]
+ mov x23, xzr
+ b 2f

-0: ld1 {T1.2d}, [x2], #16
- sub w0, w0, #1
+1: ld1 {T1.2d}, [x21], #16
+ sub w19, w19, #1

-1: /* multiply XL by SHASH in GF(2^128) */
+2: /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64 T1.16b, T1.16b )
ext T2.16b, XL.16b, XL.16b, #8
@@ -250,9 +263,19 @@ CPU_LE( rev64 T1.16b, T1.16b )
eor T2.16b, T2.16b, XH.16b
eor XL.16b, XL.16b, T2.16b
- cbnz w0, 0b
+ cbz w19, 3f // all blocks processed?

- st1 {XL.2d}, [x1]
+ yield_neon_pre w19, \yield, 1, 1b // continue at 1b unless a yield is due
+ st1 {XL.2d}, [x20] // yielding: spill the digest first
+ yield_neon_post 0b // after the yield, restart at 0b to reload NEON state
+
+ b 1b // reached only if the yield was not taken
+
+3: st1 {XL.2d}, [x20] // done: store the final digest
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldr x23, [sp, #48]
+ ldp x29, x30, [sp], #64
ret
.endm
@@ -261,11 +284,11 @@ CPU_LE( rev64 T1.16b, T1.16b )
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update_p64)
- __pmull_ghash p64
+ __pmull_ghash p64, 5
ENDPROC(pmull_ghash_update_p64)
ENTRY(pmull_ghash_update_p8)
- __pmull_ghash p8
+ __pmull_ghash p8, 2
ENDPROC(pmull_ghash_update_p8)
KS .req v8
@@ -304,38 +327,56 @@ ENDPROC(pmull_ghash_update_p8)
.endm
.macro pmull_gcm_do_crypt, enc
- ld1 {SHASH.2d}, [x4]
- ld1 {XL.2d}, [x1]
- ldr x8, [x5, #8] // load lower counter
+ stp x29, x30, [sp, #-96]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp x25, x26, [sp, #64]
+ str x27, [sp, #80]
+
+ mov x19, x0 // blocks
+ mov x20, x1 // dg
+ mov x21, x2 // dst
+ mov x22, x3 // src
+ mov x23, x4 // key
+ mov x24, x5 // ctr
+ mov x25, x6 // rounds
+ mov x26, x7 // ks (encrypt only)
+
+ ldr x27, [x24, #8] // load lower counter
+CPU_LE( rev x27, x27 )
+
+0: ld1 {SHASH.2d}, [x23] // reload hash key (restart point after a yield)
+ ld1 {XL.2d}, [x20] // reload digest
movi MASK.16b, #0xe1
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
-CPU_LE( rev x8, x8 )
shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, SHASH.16b
.if \enc == 1
- ld1 {KS.16b}, [x7]
+ ld1 {KS.16b}, [x26]
.endif
-0: ld1 {CTR.8b}, [x5] // load upper counter
- ld1 {INP.16b}, [x3], #16
- rev x9, x8
- add x8, x8, #1
- sub w0, w0, #1
+1: ld1 {CTR.8b}, [x24] // load upper counter
+ ld1 {INP.16b}, [x22], #16
+ rev x9, x27
+ add x27, x27, #1
+ sub w19, w19, #1
ins CTR.d[1], x9 // set lower counter
.if \enc == 1
eor INP.16b, INP.16b, KS.16b // encrypt input
- st1 {INP.16b}, [x2], #16
+ st1 {INP.16b}, [x21], #16
.endif
rev64 T1.16b, INP.16b
- cmp w6, #12
- b.ge 2f // AES-192/256?
+ cmp w25, #12
+ b.ge 4f // AES-192/256?

-1: enc_round CTR, v21
+2: enc_round CTR, v21
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
@@ -390,27 +431,42 @@ CPU_LE( rev x8, x8 )
.if \enc == 0
eor INP.16b, INP.16b, KS.16b
- st1 {INP.16b}, [x2], #16
+ st1 {INP.16b}, [x21], #16
.endif
- cbnz w0, 0b
+ cbz w19, 3f // all blocks processed?

-CPU_LE( rev x8, x8 )
- st1 {XL.2d}, [x1]
- str x8, [x5, #8] // store lower counter
+ yield_neon_pre w19, 8, 1, 1b // yield every 8 blocks; continue at 1b otherwise
+ st1 {XL.2d}, [x20] // yielding: spill the digest first
+ .if \enc == 1
+ st1 {KS.16b}, [x26] // ... and the pending keystream block
+ .endif
+ yield_neon_post 0b // after the yield, restart at 0b to reload NEON state
+ b 1b // reached only if the yield was not taken
+
+3: st1 {XL.2d}, [x20] // done: store the final digest
.if \enc == 1
- st1 {KS.16b}, [x7]
+ st1 {KS.16b}, [x26]
.endif
+CPU_LE( rev x27, x27 )
+ str x27, [x24, #8] // store lower counter
+
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp x25, x26, [sp, #64]
+ ldr x27, [sp, #80]
+ ldp x29, x30, [sp], #96
ret
-2: b.eq 3f // AES-192?
+4: b.eq 5f // AES-192?
enc_round CTR, v17
enc_round CTR, v18
-3: enc_round CTR, v19
+5: enc_round CTR, v19
enc_round CTR, v20
- b 1b
+ b 2b
.endm
/*
This updates both the core GHASH implementation and the AES-GCM algorithm to
yield each time after processing a fixed chunk of input. For the GCM driver,
we align with the other AES/CE block mode drivers, and use a block size of
64 bytes. The core GHASH is much shorter, so let's use a block size of 128
bytes for that one.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/ghash-ce-core.S | 128 ++++++++++++++------
 1 file changed, 92 insertions(+), 36 deletions(-)

-- 
2.11.0
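
As a rough mental model of the control flow this creates (the
yield_neon_pre / yield_neon_post macros themselves are introduced elsewhere
in this series), the loop in both routines behaves like the C sketch below.
All names other than kernel_neon_begin()/kernel_neon_end() are illustrative
stand-ins, not kernel APIs:

	/* Rough C model of the yielding loop; stand-in names throughout. */
	#include <stdbool.h>
	#include <stdio.h>

	#define YIELD_BLOCKS 8 /* the GCM path checks every 8 blocks */

	static void load_neon_state(void) { puts("reload key and digest"); }
	static void save_neon_state(void) { puts("spill digest (and keystream)"); }
	static void process_block(void) { puts("consume one 16-byte block"); }
	static bool reschedule_pending(void) { return true; /* pretend */ }
	static void kernel_neon_end(void) { puts("kernel_neon_end: preemption point"); }
	static void kernel_neon_begin(void) { puts("kernel_neon_begin"); }

	static void ghash_update_model(int blocks)
	{
		for (;;) {
			load_neon_state(); /* label 0: in the asm */

			do {
				process_block(); /* labels 1:/2: */
			} while (--blocks &&
				 (blocks % YIELD_BLOCKS || !reschedule_pending()));

			save_neon_state(); /* st1 {XL.2d}, [x20] */
			if (!blocks)
				return; /* label 3: */

			kernel_neon_end(); /* open a preemption window */
			kernel_neon_begin();
		}
	}

	int main(void)
	{
		ghash_update_model(20);
		return 0;
	}

The constraint driving this structure is that NEON register contents do not
survive kernel_neon_end(), so both routines spill the digest (and, when
encrypting, the pending keystream block) before yielding, and branch back to
their label 0 afterwards to reload the key and digest; the function arguments
are parked in callee-saved general-purpose registers for the same reason.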