Diffstat (limited to 'sys/crypto/openssl/aarch64')
-rw-r--r--  sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S   8488
-rw-r--r--  sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S               1
-rw-r--r--  sys/crypto/openssl/aarch64/aesv8-armx.S                   731
-rw-r--r--  sys/crypto/openssl/aarch64/arm64cpuid.S                   137
-rw-r--r--  sys/crypto/openssl/aarch64/armv8-mont.S                     1
-rw-r--r--  sys/crypto/openssl/aarch64/bsaes-armv8.S                 2356
-rw-r--r--  sys/crypto/openssl/aarch64/chacha-armv8-sve.S            3559
-rw-r--r--  sys/crypto/openssl/aarch64/chacha-armv8.S                  59
-rw-r--r--  sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S           100
-rw-r--r--  sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S            837
-rw-r--r--  sys/crypto/openssl/aarch64/ghashv8-armx.S                 105
-rw-r--r--  sys/crypto/openssl/aarch64/keccak1600-armv8.S              14
-rw-r--r--  sys/crypto/openssl/aarch64/md5-aarch64.S                  678
-rw-r--r--  sys/crypto/openssl/aarch64/poly1305-armv8.S                20
-rw-r--r--  sys/crypto/openssl/aarch64/sha1-armv8.S                     6
-rw-r--r--  sys/crypto/openssl/aarch64/sha256-armv8.S                  15
-rw-r--r--  sys/crypto/openssl/aarch64/sha512-armv8.S                  12
-rw-r--r--  sys/crypto/openssl/aarch64/sm3-armv8.S                    509
-rw-r--r--  sys/crypto/openssl/aarch64/sm4-armv8.S                   1093
-rw-r--r--  sys/crypto/openssl/aarch64/vpaes-armv8.S                   53
-rw-r--r--  sys/crypto/openssl/aarch64/vpsm4-armv8.S                 5021
-rw-r--r--  sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S              4523
22 files changed, 28229 insertions, 89 deletions
diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S
new file mode 100644
index 000000000000..61e9326175d0
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S
@@ -0,0 +1,8488 @@
+/* Do not modify. This file is auto-generated from aes-gcm-armv8-unroll8_64.pl. */
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+.arch armv8-a+crypto
+.text
+.globl unroll8_eor3_aes_gcm_enc_128_kernel
+.type unroll8_eor3_aes_gcm_enc_128_kernel,%function
+.align 4
+unroll8_eor3_aes_gcm_enc_128_kernel:
+ AARCH64_VALID_CALL_TARGET
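+ // Register interface, as inferred from the code below (this generated file does
+ // not document it): x0 = plaintext in, x1 = length in bits, x2 = ciphertext out,
+ // x3 = current tag (Xi) followed by the precomputed GHASH key powers,
+ // x4 = counter block, x5 = expanded AES-128 round keys (rk0..rk10).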
+ cbz x1, .L128_enc_ret
+ stp d8, d9, [sp, #-80]!
+ lsr x9, x1, #3
+ mov x16, x4
+ mov x8, x5
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ mov x5, #0xc200000000000000
+ stp x5, xzr, [sp, #64]
+ add x10, sp, #64
+
+ mov x15, #0x100000000 //set up counter increment
+ movi v31.16b, #0x0
+ mov v31.d[1], x15
+ mov x5, x9
+ ld1 { v0.16b}, [x16] //CTR block 0
+
+ sub x5, x5, #1 //byte_len - 1
+
+ and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ rev32 v30.16b, v0.16b //set up reversed counter
+
+ add v30.4s, v30.4s, v31.4s //CTR block 0
+
+ rev32 v1.16b, v30.16b //CTR block 1
+ add v30.4s, v30.4s, v31.4s //CTR block 1
+
+ rev32 v2.16b, v30.16b //CTR block 2
+ add v30.4s, v30.4s, v31.4s //CTR block 2
+
+ rev32 v3.16b, v30.16b //CTR block 3
+ add v30.4s, v30.4s, v31.4s //CTR block 3
+
+ rev32 v4.16b, v30.16b //CTR block 4
+ add v30.4s, v30.4s, v31.4s //CTR block 4
+
+ rev32 v5.16b, v30.16b //CTR block 5
+ add v30.4s, v30.4s, v31.4s //CTR block 5
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ rev32 v6.16b, v30.16b //CTR block 6
+ add v30.4s, v30.4s, v31.4s //CTR block 6
+
+ rev32 v7.16b, v30.16b //CTR block 7
+ add v30.4s, v30.4s, v31.4s //CTR block 7
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 0
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 1
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 1
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 2
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 2
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 2
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 2
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 3
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 3
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 3
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 3
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 3
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 4
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 3
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 4
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 4
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 4
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 5
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 5
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 5
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 5
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 5
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 5
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 6
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 6
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 6
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 7
+
+ ld1 { v19.16b}, [x3]
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 7
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 7
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 7
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 7
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+ ldr q27, [x8, #160] //load rk10
+
+ aese v3.16b, v26.16b //AES block 8k+11 - round 9
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+ aese v2.16b, v26.16b //AES block 8k+10 - round 9
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ aese v6.16b, v26.16b //AES block 8k+14 - round 9
+
+ aese v4.16b, v26.16b //AES block 8k+12 - round 9
+ add x5, x5, x0
+ aese v0.16b, v26.16b //AES block 8k+8 - round 9
+
+ aese v7.16b, v26.16b //AES block 8k+15 - round 9
+ aese v5.16b, v26.16b //AES block 8k+13 - round 9
+ aese v1.16b, v26.16b //AES block 8k+9 - round 9
+
+ add x4, x0, x1, lsr #3 //end_input_ptr
+ cmp x0, x5 //check if we have <= 8 blocks
+ b.ge .L128_enc_tail //handle tail
+
+ ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
+
+ ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
+
+ ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
+
+ ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
+ cmp x0, x5 //check if we have <= 8 blocks
+
+.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
+ rev32 v0.16b, v30.16b //CTR block 8
+ add v30.4s, v30.4s, v31.4s //CTR block 8
+
+.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
+ stp q8, q9, [x2], #32 //AES block 0, 1 - store result
+
+ rev32 v1.16b, v30.16b //CTR block 9
+.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
+ add v30.4s, v30.4s, v31.4s //CTR block 9
+
+.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
+.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
+.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
+
+ rev32 v2.16b, v30.16b //CTR block 10
+ add v30.4s, v30.4s, v31.4s //CTR block 10
+
+.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
+.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b,v27.16b //AES block 7 - result
+ stp q10, q11, [x2], #32 //AES block 2, 3 - store result
+
+ rev32 v3.16b, v30.16b //CTR block 11
+ add v30.4s, v30.4s, v31.4s //CTR block 11
+ stp q12, q13, [x2], #32 //AES block 4, 5 - store result
+
+ stp q14, q15, [x2], #32 //AES block 6, 7 - store result
+
+ rev32 v4.16b, v30.16b //CTR block 12
+ add v30.4s, v30.4s, v31.4s //CTR block 12
+ b.ge .L128_enc_prepretail //do prepretail
+
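+ // Main loop: each iteration encrypts 8 counter blocks (AES blocks 8k+8..8k+15)
+ // while folding the previous 8 ciphertext blocks into the GHASH accumulator
+ // (pmull/pmull2 per block, EOR3 to combine, then a modular reduction).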
+.L128_enc_main_loop: //main loop start
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+ rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+
+ rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b,v9.16b //GHASH block 8k+2, 8k+3 - high
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+
+ rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+ ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ rev32 v20.16b, v30.16b //CTR block 8k+16
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+16
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+
+ pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+
+ rev32 v22.16b, v30.16b //CTR block 8k+17
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+17
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ ldr q27, [x8, #160] //load rk10
+
+ ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ rev32 v23.16b, v30.16b //CTR block 8k+18
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+18
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+
+ aese v2.16b, v26.16b //AES block 8k+10 - round 9
+ aese v4.16b, v26.16b //AES block 8k+12 - round 9
+ aese v1.16b, v26.16b //AES block 8k+9 - round 9
+
+ ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
+ rev32 v25.16b, v30.16b //CTR block 8k+19
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+19
+
+ cmp x0, x5 //.LOOP CONTROL
+.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result
+ aese v7.16b, v26.16b //AES block 8k+15 - round 9
+
+ aese v6.16b, v26.16b //AES block 8k+14 - round 9
+ aese v3.16b, v26.16b //AES block 8k+11 - round 9
+
+.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
+
+ mov v2.16b, v23.16b //CTR block 8k+18
+ aese v0.16b, v26.16b //AES block 8k+8 - round 9
+
+ rev32 v4.16b, v30.16b //CTR block 8k+20
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+20
+
+.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result
+ aese v5.16b, v26.16b //AES block 8k+13 - round 9
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+
+.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
+.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
+ mov v3.16b, v25.16b //CTR block 8k+19
+
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result
+ mov v1.16b, v22.16b //CTR block 8k+17
+
+.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
+ mov v0.16b, v20.16b //CTR block 8k+16
+ stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
+
+ stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
+.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result
+
+ stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+
+ stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
+ b.lt .L128_enc_main_loop
+
+.L128_enc_prepretail: //PREPRETAIL
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+
+ rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+
+ rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
+ rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+
+ rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ ldr d16, [x10] //MODULO - load modulo constant
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+
+ pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+ ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+.inst 0xce114a73 //eor3 v19.16b, v19.16b, v17.16b, v18.16b //MODULO - fold into low
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+
+ ldr q27, [x8, #160] //load rk10
+ aese v6.16b, v26.16b //AES block 8k+14 - round 9
+ aese v2.16b, v26.16b //AES block 8k+10 - round 9
+
+ aese v0.16b, v26.16b //AES block 8k+8 - round 9
+ aese v1.16b, v26.16b //AES block 8k+9 - round 9
+
+ aese v3.16b, v26.16b //AES block 8k+11 - round 9
+ aese v5.16b, v26.16b //AES block 8k+13 - round 9
+
+ aese v4.16b, v26.16b //AES block 8k+12 - round 9
+ aese v7.16b, v26.16b //AES block 8k+15 - round 9
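+ // Tail: up to 8 remaining blocks are handled one at a time below; the last
+ // (possibly partial) block is masked to its valid bytes before the final
+ // GHASH update and modular reduction, and the updated counter is written back.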
+.L128_enc_tail: //TAIL
+
+ sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
+ ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
+
+ mov v29.16b, v27.16b
+ ldp q20, q21, [x3, #128] //load h5l | h5h and h6k | h5k
+ ext v20.16b, v20.16b, v20.16b, #8
+
+.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
+ ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
+ ldp q22, q23, [x3, #160] //load h6l | h6h and h7l | h7h
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+
+ ldp q24, q25, [x3, #192] //load h8k | h7k and h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ cmp x5, #112
+ b.gt .L128_enc_blocks_more_than_7
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+ movi v17.8b, #0
+
+ cmp x5, #96
+ sub v30.4s, v30.4s, v31.4s
+ mov v5.16b, v4.16b
+
+ mov v4.16b, v3.16b
+ mov v3.16b, v2.16b
+ mov v2.16b, v1.16b
+
+ movi v19.8b, #0
+ movi v18.8b, #0
+ b.gt .L128_enc_blocks_more_than_6
+
+ mov v7.16b, v6.16b
+ cmp x5, #80
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+
+ mov v4.16b, v3.16b
+ mov v3.16b, v1.16b
+ b.gt .L128_enc_blocks_more_than_5
+
+ cmp x5, #64
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v4.16b
+ mov v4.16b, v1.16b
+ b.gt .L128_enc_blocks_more_than_4
+
+ mov v7.16b, v6.16b
+ sub v30.4s, v30.4s, v31.4s
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v1.16b
+ cmp x5, #48
+ b.gt .L128_enc_blocks_more_than_3
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v6.16b
+ mov v6.16b, v1.16b
+
+ cmp x5, #32
+ ldr q24, [x3, #96] //load h4k | h3k
+ b.gt .L128_enc_blocks_more_than_2
+
+ cmp x5, #16
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v1.16b
+ b.gt .L128_enc_blocks_more_than_1
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ sub v30.4s, v30.4s, v31.4s
+ b .L128_enc_blocks_less_than_1
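+ // Each .L128_enc_blocks_more_than_N section below handles one of the remaining
+ // blocks (store result, accumulate GHASH) and falls through to the next;
+ // .L128_enc_blocks_less_than_1 finishes the final, possibly partial, block.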
+.L128_enc_blocks_more_than_7: //blocks left > 7
+ st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-7 block
+ ldr q9, [x0], #16 //AES final-6 block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
+
+ ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
+
+ pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
+ pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
+.L128_enc_blocks_more_than_6: //blocks left > 6
+
+ st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-6 block
+ ldr q9, [x0], #16 //AES final-5 block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
+
+.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
+.L128_enc_blocks_more_than_5: //blocks left > 5
+
+ st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-5 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
+ ldr q9, [x0], #16 //AES final-4 block - load plaintext
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
+
+.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
+.L128_enc_blocks_more_than_4: //blocks left > 4
+
+ st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-4 block
+
+ ldr q9, [x0], #16 //AES final-3 block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
+
+ pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
+ pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
+
+.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
+ eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
+.L128_enc_blocks_more_than_3: //blocks left > 3
+
+ st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
+
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ rev64 v8.16b, v9.16b //GHASH final-3 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
+ ldr q24, [x3, #96] //load h4k | h3k
+ pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
+
+ ldr q9, [x0], #16 //AES final-2 block - load plaintext
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
+
+.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
+
+ pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
+ pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
+.L128_enc_blocks_more_than_2: //blocks left > 2
+
+ st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-2 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ldr q9, [x0], #16 //AES final-1 block - load plaintext
+
+ ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
+.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
+
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
+.L128_enc_blocks_more_than_1: //blocks left > 1
+
+ st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
+
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ rev64 v8.16b, v9.16b //GHASH final-1 block
+ ldr q9, [x0], #16 //AES final block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+ ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
+.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
+
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
+
+ ldr q21, [x3, #48] //load h2k | h1k
+
+ ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
+
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
+.L128_enc_blocks_less_than_1: //blocks left <= 1
+
+ rev32 v30.16b, v30.16b
+ str q30, [x16] //store the updated counter
+ and x1, x1, #127 //bit_length %= 128
+
+ sub x1, x1, #128 //bit_length -= 128
+
+ neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
+
+ mvn x6, xzr //temp0_x = 0xffffffffffffffff
+ ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
+ and x1, x1, #127 //bit_length %= 128
+
+ lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
+ mvn x7, xzr //temp1_x = 0xffffffffffffffff
+ cmp x1, #64
+
+ csel x13, x7, x6, lt
+ csel x14, x6, xzr, lt
+
+ mov v0.d[1], x14
+ mov v0.d[0], x13 //ctr0b is mask for last block
+
+ and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
+
+ rev64 v8.16b, v9.16b //GHASH final block
+
+ bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
+ st1 { v9.16b}, [x2] //store all 16B
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v16.d[0], v8.d[1] //GHASH final block - mid
+
+ eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+
+ pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
+
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
+ eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+
+ pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final block - high
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final block - low
+
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+ st1 { v19.16b }, [x3]
+ mov x0, x9
+
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ ldp d8, d9, [sp], #80
+ ret
+
+.L128_enc_ret:
+ mov w0, #0x0
+ ret
+.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
+.globl unroll8_eor3_aes_gcm_dec_128_kernel
+.type unroll8_eor3_aes_gcm_dec_128_kernel,%function
+.align 4
+unroll8_eor3_aes_gcm_dec_128_kernel:
+ AARCH64_VALID_CALL_TARGET
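+ // Register interface mirrors the encrypt kernel (inferred from the code):
+ // x0 = ciphertext in, x1 = length in bits, x2 = plaintext out, x3 = Xi/Htable,
+ // x4 = counter block, x5 = expanded AES-128 round keys.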
+ cbz x1, .L128_dec_ret
+ stp d8, d9, [sp, #-80]!
+ lsr x9, x1, #3
+ mov x16, x4
+ mov x8, x5
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ mov x5, #0xc200000000000000
+ stp x5, xzr, [sp, #64]
+ add x10, sp, #64
+
+ mov x5, x9
+ ld1 { v0.16b}, [x16] //CTR block 0
+
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ sub x5, x5, #1 //byte_len - 1
+
+ mov x15, #0x100000000 //set up counter increment
+ movi v31.16b, #0x0
+ mov v31.d[1], x15
+ ld1 { v19.16b}, [x3]
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+
+ rev32 v30.16b, v0.16b //set up reversed counter
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+
+ add v30.4s, v30.4s, v31.4s //CTR block 0
+
+ rev32 v1.16b, v30.16b //CTR block 1
+ add v30.4s, v30.4s, v31.4s //CTR block 1
+
+ and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ rev32 v2.16b, v30.16b //CTR block 2
+ add v30.4s, v30.4s, v31.4s //CTR block 2
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+
+ rev32 v3.16b, v30.16b //CTR block 3
+ add v30.4s, v30.4s, v31.4s //CTR block 3
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+
+ rev32 v4.16b, v30.16b //CTR block 4
+ add v30.4s, v30.4s, v31.4s //CTR block 4
+
+ rev32 v5.16b, v30.16b //CTR block 5
+ add v30.4s, v30.4s, v31.4s //CTR block 5
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+
+ rev32 v6.16b, v30.16b //CTR block 6
+ add v30.4s, v30.4s, v31.4s //CTR block 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 0
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 0
+
+ rev32 v7.16b, v30.16b //CTR block 7
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 0
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 0
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 1
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 1
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 2
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 2
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 3
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 3
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 3
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 3
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 3
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 3
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 3
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 4
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 3
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 4
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 4
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 5
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 5
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 5
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 5
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 5
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 5
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 6
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 5
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 6
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 6
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 6
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 7
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 7
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 7
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 7
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 7
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 7
+
+ add x5, x5, x0
+ add v30.4s, v30.4s, v31.4s //CTR block 7
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 8
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 8
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 8
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 8
+
+ aese v0.16b, v26.16b //AES block 0 - round 9
+ aese v1.16b, v26.16b //AES block 1 - round 9
+ aese v6.16b, v26.16b //AES block 6 - round 9
+
+ ldr q27, [x8, #160] //load rk10
+ aese v4.16b, v26.16b //AES block 4 - round 9
+ aese v3.16b, v26.16b //AES block 3 - round 9
+
+ aese v2.16b, v26.16b //AES block 2 - round 9
+ aese v5.16b, v26.16b //AES block 5 - round 9
+ aese v7.16b, v26.16b //AES block 7 - round 9
+
+ add x4, x0, x1, lsr #3 //end_input_ptr
+ cmp x0, x5 //check if we have <= 8 blocks
+ b.ge .L128_dec_tail //handle tail
+
+ ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
+
+.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
+.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
+ stp q0, q1, [x2], #32 //AES block 0, 1 - store result
+
+ rev32 v0.16b, v30.16b //CTR block 8
+ add v30.4s, v30.4s, v31.4s //CTR block 8
+ ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
+
+ ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
+
+ rev32 v1.16b, v30.16b //CTR block 9
+ add v30.4s, v30.4s, v31.4s //CTR block 9
+ ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
+
+.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
+.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
+ stp q2, q3, [x2], #32 //AES block 2, 3 - store result
+
+ rev32 v2.16b, v30.16b //CTR block 10
+ add v30.4s, v30.4s, v31.4s //CTR block 10
+
+.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
+
+ rev32 v3.16b, v30.16b //CTR block 11
+ add v30.4s, v30.4s, v31.4s //CTR block 11
+
+.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
+.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
+ stp q4, q5, [x2], #32 //AES block 4, 5 - store result
+
+.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result
+ stp q6, q7, [x2], #32 //AES block 6, 7 - store result
+ rev32 v4.16b, v30.16b //CTR block 12
+
+ cmp x0, x5 //check if we have <= 8 blocks
+ add v30.4s, v30.4s, v31.4s //CTR block 12
+ b.ge .L128_dec_prepretail //do prepretail
+
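+ // Main decrypt loop: as in the encrypt kernel, 8 counter blocks are processed
+ // per iteration, but GHASH is computed over the loaded ciphertext blocks while
+ // the next keystream blocks are generated.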
+.L128_dec_main_loop: //main loop start
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ ldr d16, [x10] //MODULO - load modulo constant
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+
+ rev32 v20.16b, v30.16b //CTR block 8k+16
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+16
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+ rev32 v22.16b, v30.16b //CTR block 8k+17
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+17
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
+
+ ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+ rev32 v23.16b, v30.16b //CTR block 8k+18
+
+ ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+18
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+
+ aese v0.16b, v26.16b //AES block 8k+8 - round 9
+ aese v1.16b, v26.16b //AES block 8k+9 - round 9
+ ldr q27, [x8, #160] //load rk10
+
+ aese v6.16b, v26.16b //AES block 8k+14 - round 9
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ aese v2.16b, v26.16b //AES block 8k+10 - round 9
+
+ aese v7.16b, v26.16b //AES block 8k+15 - round 9
+ aese v4.16b, v26.16b //AES block 8k+12 - round 9
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+ rev32 v25.16b, v30.16b //CTR block 8k+19
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+19
+
+ aese v3.16b, v26.16b //AES block 8k+11 - round 9
+ aese v5.16b, v26.16b //AES block 8k+13 - round 9
+.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
+
+.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
+.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result
+.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result
+
+.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
+ stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
+ mov v1.16b, v22.16b //CTR block 8k+17
+
+.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ mov v0.16b, v20.16b //CTR block 8k+16
+
+.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
+ cmp x0, x5 //.LOOP CONTROL
+ stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
+
+.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result
+ mov v2.16b, v23.16b //CTR block 8k+18
+
+ stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
+ rev32 v4.16b, v30.16b //CTR block 8k+20
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+20
+
+ stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
+ mov v3.16b, v25.16b //CTR block 8k+19
+ b.lt .L128_dec_main_loop
+
+.L128_dec_prepretail: //PREPRETAIL
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+ rev64 v8.16b, v8.16b //GHASH block 8k
+
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+
+ ldr d16, [x10] //MODULO - load modulo constant
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+ ldr q27, [x8, #160] //load rk10
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+
+ aese v6.16b, v26.16b //AES block 8k+14 - round 9
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+ aese v2.16b, v26.16b //AES block 8k+10 - round 9
+
+ aese v3.16b, v26.16b //AES block 8k+11 - round 9
+ aese v5.16b, v26.16b //AES block 8k+13 - round 9
+ aese v0.16b, v26.16b //AES block 8k+8 - round 9
+
+ aese v4.16b, v26.16b //AES block 8k+12 - round 9
+ aese v1.16b, v26.16b //AES block 8k+9 - round 9
+ aese v7.16b, v26.16b //AES block 8k+15 - round 9
+
+.L128_dec_tail: //TAIL
+
+ mov v29.16b, v27.16b
+ sub x5, x4, x0 //number of bytes left to process
+
+ cmp x5, #112
+
+ ldp q24, q25, [x3, #192] //load h8k | h7k
+ ext v25.16b, v25.16b, v25.16b, #8
+ ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
+
+ ldp q20, q21, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
+
+ ldp q22, q23, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+
+.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
+ b.gt .L128_dec_blocks_more_than_7
+
+ cmp x5, #96
+ mov v7.16b, v6.16b
+ movi v19.8b, #0
+
+ movi v17.8b, #0
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+
+ mov v4.16b, v3.16b
+ mov v3.16b, v2.16b
+ mov v2.16b, v1.16b
+
+ movi v18.8b, #0
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L128_dec_blocks_more_than_6
+
+ cmp x5, #80
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+
+ mov v4.16b, v3.16b
+ mov v3.16b, v1.16b
+ b.gt .L128_dec_blocks_more_than_5
+
+ cmp x5, #64
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+
+ mov v4.16b, v1.16b
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L128_dec_blocks_more_than_4
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v1.16b
+ cmp x5, #48
+ b.gt .L128_dec_blocks_more_than_3
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v6.16b
+ cmp x5, #32
+
+ ldr q24, [x3, #96] //load h4k | h3k
+ mov v6.16b, v1.16b
+ b.gt .L128_dec_blocks_more_than_2
+
+ cmp x5, #16
+
+ mov v7.16b, v1.16b
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L128_dec_blocks_more_than_1
+
+ sub v30.4s, v30.4s, v31.4s
+ ldr q21, [x3, #48] //load h2k | h1k
+ b .L128_dec_blocks_less_than_1
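+	// NOTE (editorial annotation): each ".._more_than_N" section below consumes one
+	// remaining ciphertext block; with r blocks left the block handled there is
+	// multiplied by H^r, which is why the table loads walk down from h8 to h1.  The
+	// register shuffles in the dispatch code above realign the unused keystream so
+	// that v7 always ends up holding the keystream for the final, possibly partial
+	// block.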
+.L128_dec_blocks_more_than_7: //blocks left > 7
+ rev64 v8.16b, v9.16b //GHASH final-7 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
+
+ pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
+ ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+ ldr q9, [x0], #16 //AES final-6 block - load ciphertext
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
+ st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
+.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
+
+ pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
+.L128_dec_blocks_more_than_6: //blocks left > 6
+
+ rev64 v8.16b, v9.16b //GHASH final-6 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
+ ldr q9, [x0], #16 //AES final-5 block - load ciphertext
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
+ st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
+ eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
+.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
+.L128_dec_blocks_more_than_5: //blocks left > 5
+
+ rev64 v8.16b, v9.16b //GHASH final-5 block
+
+ ldr q9, [x0], #16 //AES final-4 block - load ciphertext
+ st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
+
+.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
+ eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
+.L128_dec_blocks_more_than_4: //blocks left > 4
+
+ rev64 v8.16b, v9.16b //GHASH final-4 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ ldr q9, [x0], #16 //AES final-3 block - load ciphertext
+
+ ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
+
+ pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
+
+ st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
+ eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
+
+.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
+ eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
+
+ pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
+.L128_dec_blocks_more_than_3: //blocks left > 3
+
+ st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
+ rev64 v8.16b, v9.16b //GHASH final-3 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
+
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ ldr q24, [x3, #96] //load h4k | h3k
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
+
+ ldr q9, [x0], #16 //AES final-2 block - load ciphertext
+
+ ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
+ pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
+ pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
+ eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
+
+ pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
+ eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
+.L128_dec_blocks_more_than_2: //blocks left > 2
+
+ rev64 v8.16b, v9.16b //GHASH final-2 block
+
+ st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
+
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
+ ldr q9, [x0], #16 //AES final-1 block - load ciphertext
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
+
+.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
+ eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
+.L128_dec_blocks_more_than_1: //blocks left > 1
+
+ st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
+ rev64 v8.16b, v9.16b //GHASH final-1 block
+
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
+
+ ldr q9, [x0], #16 //AES final block - load ciphertext
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
+ ldr q21, [x3, #48] //load h2k | h1k
+
+ ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
+.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
+
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
+.L128_dec_blocks_less_than_1: //blocks left <= 1
+
+ and x1, x1, #127 //bit_length %= 128
+
+ sub x1, x1, #128 //bit_length -= 128
+
+ neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
+
+ mvn x6, xzr //temp0_x = 0xffffffffffffffff
+ and x1, x1, #127 //bit_length %= 128
+
+ lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
+ cmp x1, #64
+ mvn x7, xzr //temp1_x = 0xffffffffffffffff
+
+ csel x13, x7, x6, lt
+ csel x14, x6, xzr, lt
+
+ mov v0.d[1], x14
+ mov v0.d[0], x13 //ctr0b is mask for last block
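+	// NOTE (editorial annotation): worked example of the mask just built, assuming
+	// the last block holds 3 bytes (24 bits): x1 = (128 - 24) & 127 = 104, the
+	// variable lsr shifts by 104 mod 64 = 40 so x6 = 0x0000000000ffffff, and since
+	// x1 >= 64 the csels give x13 = x6, x14 = 0.  v0 therefore keeps only the 3
+	// valid low-addressed bytes of the last block and clears everything above them.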
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
+
+ and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
+
+ rev64 v8.16b, v9.16b //GHASH final block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
+ ins v16.d[0], v8.d[1] //GHASH final block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final block - high
+ eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
+
+ bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
+
+ pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
+ st1 { v12.16b}, [x2] //store all 16B
+
+ pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
+
+ eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final block - low
+
+ eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+
+ pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
+
+.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+ st1 { v19.16b }, [x3]
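+	// NOTE (editorial annotation): the MODULO steps above are the usual
+	// bit-reflected GHASH reduction: the wide Karatsuba result (high v17, mid v18,
+	// low v19) is folded back to 128 bits with two pmulls by the constant
+	// 0xc200000000000000 kept at [x10], which corresponds to the GHASH polynomial
+	// x^128 + x^7 + x^2 + x + 1.  The ext/rev64 pair then restores the stored byte
+	// order of Xi before it is written back to [x3].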
+ rev32 v30.16b, v30.16b
+
+ str q30, [x16] //store the updated counter
+
+ mov x0, x9
+
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ ldp d8, d9, [sp], #80
+ ret
+.L128_dec_ret:
+ mov w0, #0x0
+ ret
+.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
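+	// NOTE (editorial annotation): the 192-bit kernels that follow are structurally
+	// identical to the 128-bit ones; only the key schedule is longer, so each block
+	// runs twelve aese/aesmc rounds (rk0-rk11) and the final AddRoundKey uses rk12,
+	// loaded from [x8, #192], inside the EOR3 that combines keystream and data.
+	// As in the epilogue above, the kernels return the processed byte count (x9)
+	// in x0 and write the updated counter back through x16.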
+.globl unroll8_eor3_aes_gcm_enc_192_kernel
+.type unroll8_eor3_aes_gcm_enc_192_kernel,%function
+.align 4
+unroll8_eor3_aes_gcm_enc_192_kernel:
+ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L192_enc_ret
+ stp d8, d9, [sp, #-80]!
+ lsr x9, x1, #3
+ mov x16, x4
+ mov x8, x5
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ mov x5, #0xc200000000000000
+ stp x5, xzr, [sp, #64]
+ add x10, sp, #64
+
+ mov x5, x9
+ ld1 { v0.16b}, [x16] //CTR block 0
+
+ mov x15, #0x100000000 //set up counter increment
+ movi v31.16b, #0x0
+ mov v31.d[1], x15
+
+ rev32 v30.16b, v0.16b //set up reversed counter
+
+ add v30.4s, v30.4s, v31.4s //CTR block 0
+
+ rev32 v1.16b, v30.16b //CTR block 1
+ add v30.4s, v30.4s, v31.4s //CTR block 1
+
+ rev32 v2.16b, v30.16b //CTR block 2
+ add v30.4s, v30.4s, v31.4s //CTR block 2
+
+ rev32 v3.16b, v30.16b //CTR block 3
+ add v30.4s, v30.4s, v31.4s //CTR block 3
+
+ rev32 v4.16b, v30.16b //CTR block 4
+ add v30.4s, v30.4s, v31.4s //CTR block 4
+ sub x5, x5, #1 //byte_len - 1
+
+ and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ rev32 v5.16b, v30.16b //CTR block 5
+ add v30.4s, v30.4s, v31.4s //CTR block 5
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ add x5, x5, x0
+
+ rev32 v6.16b, v30.16b //CTR block 6
+ add v30.4s, v30.4s, v31.4s //CTR block 6
+
+ rev32 v7.16b, v30.16b //CTR block 7
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 0
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 0
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 1
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 1
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 2
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 2
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 3
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 3
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 3
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 3
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 3
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 3
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 4
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 3
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 4
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 4
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 5
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 5
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 5
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 5
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 5
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 5
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 5
+
+ add v30.4s, v30.4s, v31.4s //CTR block 7
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 6
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 6
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 6
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 6
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 7
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 7
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 7
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 7
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 7
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 8
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 8
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 8
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 8
+
+ add x4, x0, x1, lsr #3 //end_input_ptr
+ cmp x0, x5 //check if we have <= 8 blocks
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 9
+
+ ld1 { v19.16b}, [x3]
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 9
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 9
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 9
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 9
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 9
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 9
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 10
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 10
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 10
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 10
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 10
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 10
+
+ aese v6.16b, v28.16b //AES block 6 - round 11
+ aese v3.16b, v28.16b //AES block 3 - round 11
+
+ aese v4.16b, v28.16b //AES block 4 - round 11
+ aese v7.16b, v28.16b //AES block 7 - round 11
+ ldr q26, [x8, #192] //load rk12
+
+ aese v1.16b, v28.16b //AES block 1 - round 11
+ aese v5.16b, v28.16b //AES block 5 - round 11
+
+ aese v2.16b, v28.16b //AES block 2 - round 11
+ aese v0.16b, v28.16b //AES block 0 - round 11
+ b.ge .L192_enc_tail //handle tail
+
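+	// NOTE (editorial annotation): GCM authenticates ciphertext, so the encrypt
+	// kernels hash one iteration behind: the 8 blocks encrypted here feed GHASH
+	// only on the next pass (or in prepretail/tail), and the main loop reuses
+	// v8-v15, which still hold the ciphertext produced on the previous pass.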
+ ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
+
+ ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
+
+ ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
+
+ ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
+
+.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
+ rev32 v0.16b, v30.16b //CTR block 8
+ add v30.4s, v30.4s, v31.4s //CTR block 8
+
+.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
+.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
+
+ rev32 v1.16b, v30.16b //CTR block 9
+ add v30.4s, v30.4s, v31.4s //CTR block 9
+.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
+
+.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
+.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
+ stp q8, q9, [x2], #32 //AES block 0, 1 - store result
+
+.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
+ rev32 v2.16b, v30.16b //CTR block 10
+ add v30.4s, v30.4s, v31.4s //CTR block 10
+
+ stp q10, q11, [x2], #32 //AES block 2, 3 - store result
+ cmp x0, x5 //check if we have <= 8 blocks
+
+ rev32 v3.16b, v30.16b //CTR block 11
+ add v30.4s, v30.4s, v31.4s //CTR block 11
+.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
+
+ stp q12, q13, [x2], #32 //AES block 4, 5 - store result
+
+ rev32 v4.16b, v30.16b //CTR block 12
+ stp q14, q15, [x2], #32 //AES block 6, 7 - store result
+ add v30.4s, v30.4s, v31.4s //CTR block 12
+
+ b.ge .L192_enc_prepretail //do prepretail
+
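+	// NOTE (editorial annotation): loop bookkeeping - the counter is kept
+	// byte-reversed in v30 and bumped once per block by v31 (+1 in the top 32-bit
+	// lane, i.e. a 32-bit big-endian counter increment), round-key pairs are
+	// reloaded from [x8] as they are needed, and the loop exits when the input
+	// pointer x0 reaches x5, the end of the bulk region computed in the prologue
+	// (at least one byte is always left for the tail).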
+.L192_enc_main_loop: //main loop start
+ rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+ rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
+ rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+ ldr d16, [x10] //MODULO - load modulo constant
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+ rev32 v20.16b, v30.16b //CTR block 8k+16
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+16
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+ ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
+
+ pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ rev32 v22.16b, v30.16b //CTR block 8k+17
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+17
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+ ldr q26, [x8, #192] //load rk12
+ ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+ ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
+
+ aese v4.16b, v28.16b //AES block 8k+12 - round 11
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+ ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
+
+ ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
+ aese v2.16b, v28.16b //AES block 8k+10 - round 11
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+
+ rev32 v23.16b, v30.16b //CTR block 8k+18
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+ aese v5.16b, v28.16b //AES block 8k+13 - round 11
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+18
+
+ aese v7.16b, v28.16b //AES block 8k+15 - round 11
+ aese v0.16b, v28.16b //AES block 8k+8 - round 11
+.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result
+
+ aese v6.16b, v28.16b //AES block 8k+14 - round 11
+ aese v3.16b, v28.16b //AES block 8k+11 - round 11
+ aese v1.16b, v28.16b //AES block 8k+9 - round 11
+
+ rev32 v25.16b, v30.16b //CTR block 8k+19
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+19
+.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result
+
+.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
+.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
+ mov v2.16b, v23.16b //CTR block 8k+18
+
+.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
+ mov v1.16b, v22.16b //CTR block 8k+17
+ stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result
+ mov v0.16b, v20.16b //CTR block 8k+16
+ rev32 v4.16b, v30.16b //CTR block 8k+20
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+20
+.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+
+.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
+ mov v3.16b, v25.16b //CTR block 8k+19
+
+ stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
+
+ stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
+
+ cmp x0, x5 //.LOOP CONTROL
+ stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
+ b.lt .L192_enc_main_loop
+
+.L192_enc_prepretail: //PREPRETAIL
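+// PREPRETAIL: GHASH the last eight ciphertext blocks produced by the main loop while
+// running the AES rounds for the next eight counter blocks, whose keystream is
+// consumed by the tail below. No plaintext is loaded or stored in this section.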
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+ rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
+ rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+ rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+ ldr d16, [x10] //MODULO - load modulo constant
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ ldr q26, [x8, #192] //load rk12
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+
+ aese v1.16b, v28.16b //AES block 8k+9 - round 11
+ aese v7.16b, v28.16b //AES block 8k+15 - round 11
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+ aese v3.16b, v28.16b //AES block 8k+11 - round 11
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+ aese v2.16b, v28.16b //AES block 8k+10 - round 11
+ aese v0.16b, v28.16b //AES block 8k+8 - round 11
+
+ aese v6.16b, v28.16b //AES block 8k+14 - round 11
+ aese v4.16b, v28.16b //AES block 8k+12 - round 11
+ aese v5.16b, v28.16b //AES block 8k+13 - round 11
+
+.L192_enc_tail: //TAIL
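+// TAIL: 1 to 128 bytes remain. v0-v7 already hold keystream for up to eight counter
+// blocks; the GHASH of the final blocks uses descending powers of H, from h8 for the
+// eighth-from-last block down to h1 for the final (possibly partial) block.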
+
+ ldp q20, q21, [x3, #128] //load h5l | h5h, h6k | h5k
+ ext v20.16b, v20.16b, v20.16b, #8
+ sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
+
+ ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
+
+ ldp q24, q25, [x3, #192] //load h8k | h7k, h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ mov v29.16b, v26.16b
+
+ ldp q22, q23, [x3, #160] //load h6l | h6h, h7l | h7h
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ cmp x5, #112
+
+.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
+ ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
+ b.gt .L192_enc_blocks_more_than_7
+
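+// Fewer than eight tail blocks remain: shuffle the precomputed keystream down so the
+// remaining blocks consume it in order, and wind the counter back once for each
+// keystream block that goes unused.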
+ cmp x5, #96
+ mov v7.16b, v6.16b
+ movi v17.8b, #0
+
+ mov v6.16b, v5.16b
+ movi v19.8b, #0
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v5.16b, v4.16b
+ mov v4.16b, v3.16b
+ mov v3.16b, v2.16b
+
+ mov v2.16b, v1.16b
+ movi v18.8b, #0
+ b.gt .L192_enc_blocks_more_than_6
+
+ mov v7.16b, v6.16b
+ cmp x5, #80
+
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+ mov v4.16b, v3.16b
+
+ mov v3.16b, v1.16b
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L192_enc_blocks_more_than_5
+
+ cmp x5, #64
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+
+ mov v4.16b, v1.16b
+ b.gt .L192_enc_blocks_more_than_4
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+ mov v5.16b, v1.16b
+
+ sub v30.4s, v30.4s, v31.4s
+ cmp x5, #48
+ b.gt .L192_enc_blocks_more_than_3
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v1.16b
+ sub v30.4s, v30.4s, v31.4s
+
+ ldr q24, [x3, #96] //load h4k | h3k
+ cmp x5, #32
+ b.gt .L192_enc_blocks_more_than_2
+
+ sub v30.4s, v30.4s, v31.4s
+
+ cmp x5, #16
+ mov v7.16b, v1.16b
+ b.gt .L192_enc_blocks_more_than_1
+
+ sub v30.4s, v30.4s, v31.4s
+ ldr q21, [x3, #48] //load h2k | h1k
+ b .L192_enc_blocks_less_than_1
+.L192_enc_blocks_more_than_7: //blocks left > 7
+ st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-7 block
+ ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
+
+ ldr q9, [x0], #16 //AES final-6 block - load plaintext
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+ pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
+
+ pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
+.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
+.L192_enc_blocks_more_than_6: //blocks left > 6
+
+ st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-6 block
+
+ ldr q9, [x0], #16 //AES final-5 block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
+.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
+ eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
+ eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
+.L192_enc_blocks_more_than_5: //blocks left > 5
+
+ st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-5 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
+
+ ldr q9, [x0], #16 //AES final-4 block - load plaintext
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
+
+ ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
+
+.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
+.L192_enc_blocks_more_than_4: //blocks left > 4
+
+ st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-4 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ldr q9, [x0], #16 //AES final-3 block - load plaintext
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
+ ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
+
+ pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
+ eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+ eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
+
+ pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
+.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
+.L192_enc_blocks_more_than_3: //blocks left > 3
+
+ ldr q24, [x3, #96] //load h4k | h3k
+ st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-3 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ ldr q9, [x0], #16 //AES final-2 block - load plaintext
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
+
+.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
+ eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
+ pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
+
+ pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
+ pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
+.L192_enc_blocks_more_than_2: //blocks left > 2
+
+ st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-2 block
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ldr q9, [x0], #16 //AES final-1 block - load plaintext
+ ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
+ eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
+.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
+.L192_enc_blocks_more_than_1: //blocks left > 1
+
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-1 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
+ eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
+
+ ldr q9, [x0], #16 //AES final block - load plaintext
+ ldr q21, [x3, #48] //load h2k | h1k
+
+ ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
+
+.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
+.L192_enc_blocks_less_than_1: //blocks left <= 1
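+// Final (possibly partial) block: build a byte mask from the bit length so that only
+// the valid low-order bytes of the last keystream block are kept; bytes beyond the
+// message end are taken unchanged from the existing output buffer before the store.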
+
+ mvn x6, xzr //temp0_x = 0xffffffffffffffff
+ and x1, x1, #127 //bit_length %= 128
+
+ sub x1, x1, #128 //bit_length -= 128
+
+ neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
+
+ and x1, x1, #127 //bit_length %= 128
+
+ lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
+ cmp x1, #64
+ mvn x7, xzr //temp1_x = 0xffffffffffffffff
+
+ csel x13, x7, x6, lt
+ csel x14, x6, xzr, lt
+
+ mov v0.d[1], x14
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+
+ ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
+ mov v0.d[0], x13 //ctr0b is mask for last block
+
+ and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
+
+ rev64 v8.16b, v9.16b //GHASH final block
+ bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
+
+ st1 { v9.16b}, [x2] //store all 16B
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v16.d[0], v8.d[1] //GHASH final block - mid
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final block - high
+ pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
+
+ eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
+
+ pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
+
+ eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
+ ldr d16, [x10] //MODULO - load modulo constant
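+// MODULO: reduce the 256-bit GHASH accumulator (high v17 : mid v18 : low v19) back to
+// 128 bits using the 0xc200000000000000 constant loaded above; the reduced value is
+// byte-reversed and stored as the updated GHASH accumulator at [x3].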
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final block - low
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ rev32 v30.16b, v30.16b
+
+ str q30, [x16] //store the updated counter
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+ st1 { v19.16b }, [x3]
+
+ mov x0, x9 //return sizes
+
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ ldp d8, d9, [sp], #80
+ ret
+
+.L192_enc_ret:
+ mov w0, #0x0
+ ret
+.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
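+// 192-bit-key decrypt kernel. As used below: x0 = ciphertext in, x1 = length in bits,
+// x2 = plaintext out, x3 = GHASH state and precomputed powers of H, x4 = counter
+// block, x5 = AES round keys (rk0-rk12).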
+.globl unroll8_eor3_aes_gcm_dec_192_kernel
+.type unroll8_eor3_aes_gcm_dec_192_kernel,%function
+.align 4
+unroll8_eor3_aes_gcm_dec_192_kernel:
+ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L192_dec_ret
+ stp d8, d9, [sp, #-80]!
+ lsr x9, x1, #3
+ mov x16, x4
+ mov x8, x5
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ mov x5, #0xc200000000000000
+ stp x5, xzr, [sp, #64]
+ add x10, sp, #64
+
+ mov x5, x9
+ ld1 { v0.16b}, [x16] //CTR block 0
+ ld1 { v19.16b}, [x3]
+
+ mov x15, #0x100000000 //set up counter increment
+ movi v31.16b, #0x0
+ mov v31.d[1], x15
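+// v31 holds 1 in the top 32-bit lane; because the counter block is kept byte-reversed
+// in v30 (rev32), adding v31 increments the big-endian 32-bit counter word, and the
+// per-block rev32 restores the on-the-wire byte order.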
+
+ rev32 v30.16b, v0.16b //set up reversed counter
+
+ add v30.4s, v30.4s, v31.4s //CTR block 0
+
+ rev32 v1.16b, v30.16b //CTR block 1
+ add v30.4s, v30.4s, v31.4s //CTR block 1
+
+ rev32 v2.16b, v30.16b //CTR block 2
+ add v30.4s, v30.4s, v31.4s //CTR block 2
+
+ rev32 v3.16b, v30.16b //CTR block 3
+ add v30.4s, v30.4s, v31.4s //CTR block 3
+
+ rev32 v4.16b, v30.16b //CTR block 4
+ add v30.4s, v30.4s, v31.4s //CTR block 4
+
+ rev32 v5.16b, v30.16b //CTR block 5
+ add v30.4s, v30.4s, v31.4s //CTR block 5
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ rev32 v6.16b, v30.16b //CTR block 6
+ add v30.4s, v30.4s, v31.4s //CTR block 6
+
+ rev32 v7.16b, v30.16b //CTR block 7
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 0
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 0
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 1
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 1
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 1
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 2
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 1
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 2
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 2
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 3
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 3
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 3
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 3
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 3
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 3
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 4
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 4
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 4
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 5
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 4
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 5
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 5
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 5
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 5
+
+ sub x5, x5, #1 //byte_len - 1
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 6
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 6
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 6
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ add v30.4s, v30.4s, v31.4s //CTR block 7
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 7
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 7
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 7
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 7
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 7
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 8
+ and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 8
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 8
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 8
+
+ add x4, x0, x1, lsr #3 //end_input_ptr
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 9
+
+ ld1 { v19.16b}, [x3]
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 9
+ add x5, x5, x0
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 9
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 9
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 9
+
+ cmp x0, x5 //check if we have <= 8 blocks
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 9
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 9
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 9
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 10
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 10
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 10
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 10
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 10
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 10
+ ldr q26, [x8, #192] //load rk12
+
+ aese v0.16b, v28.16b //AES block 0 - round 11
+ aese v1.16b, v28.16b //AES block 1 - round 11
+ aese v4.16b, v28.16b //AES block 4 - round 11
+
+ aese v6.16b, v28.16b //AES block 6 - round 11
+ aese v5.16b, v28.16b //AES block 5 - round 11
+ aese v7.16b, v28.16b //AES block 7 - round 11
+
+ aese v2.16b, v28.16b //AES block 2 - round 11
+ aese v3.16b, v28.16b //AES block 3 - round 11
+ b.ge .L192_dec_tail //handle tail
+
+ ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
+
+ ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
+
+ ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
+
+.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
+.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
+ stp q0, q1, [x2], #32 //AES block 0, 1 - store result
+
+ rev32 v0.16b, v30.16b //CTR block 8
+ add v30.4s, v30.4s, v31.4s //CTR block 8
+
+ rev32 v1.16b, v30.16b //CTR block 9
+ add v30.4s, v30.4s, v31.4s //CTR block 9
+.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
+
+.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
+ stp q2, q3, [x2], #32 //AES block 2, 3 - store result
+ ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
+
+ rev32 v2.16b, v30.16b //CTR block 10
+ add v30.4s, v30.4s, v31.4s //CTR block 10
+
+.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
+
+ rev32 v3.16b, v30.16b //CTR block 11
+ add v30.4s, v30.4s, v31.4s //CTR block 11
+
+.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
+ stp q4, q5, [x2], #32 //AES block 4, 5 - store result
+ cmp x0, x5 //check if we have <= 8 blocks
+
+.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
+.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
+ rev32 v4.16b, v30.16b //CTR block 12
+
+ add v30.4s, v30.4s, v31.4s //CTR block 12
+ stp q6, q7, [x2], #32 //AES block 6, 7 - store result
+ b.ge .L192_dec_prepretail //do prepretail
+
+.L192_dec_main_loop: //main loop start
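+// Main decrypt loop: each iteration GHASHes the eight ciphertext blocks loaded in the
+// previous iteration (v8-v15), finishes the AES rounds for counter blocks v0-v7, XORs
+// them with the ciphertext to produce plaintext, and prepares the next eight counters.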
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+ rev32 v20.16b, v30.16b //CTR block 8k+16
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+16
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+ ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
+
+ rev32 v22.16b, v30.16b //CTR block 8k+17
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+17
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+ ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
+
+ rev32 v23.16b, v30.16b //CTR block 8k+18
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+18
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+ ldr q26, [x8, #192] //load rk12
+
+ ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+
+ aese v0.16b, v28.16b //AES block 8k+8 - round 11
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+ aese v1.16b, v28.16b //AES block 8k+9 - round 11
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+ aese v6.16b, v28.16b //AES block 8k+14 - round 11
+ aese v3.16b, v28.16b //AES block 8k+11 - round 11
+
+.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
+ rev32 v25.16b, v30.16b //CTR block 8k+19
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+
+ aese v4.16b, v28.16b //AES block 8k+12 - round 11
+ aese v2.16b, v28.16b //AES block 8k+10 - round 11
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+19
+
+ aese v7.16b, v28.16b //AES block 8k+15 - round 11
+ aese v5.16b, v28.16b //AES block 8k+13 - round 11
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+
+.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
+ stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
+.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
+
+.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
+.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result
+ stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
+
+.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ mov v3.16b, v25.16b //CTR block 8k+19
+
+.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result
+ stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
+ cmp x0, x5 //.LOOP CONTROL
+
+.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result
+ stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
+ mov v0.16b, v20.16b //CTR block 8k+16
+
+ mov v1.16b, v22.16b //CTR block 8k+17
+ mov v2.16b, v23.16b //CTR block 8k+18
+
+ rev32 v4.16b, v30.16b //CTR block 8k+20
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+20
+ b.lt .L192_dec_main_loop
+
+.L192_dec_prepretail: //PREPRETAIL
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+
+ ldr d16, [x10] //MODULO - load modulo constant
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ ldr q26, [x8, #192] //load rk12
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+
+ aese v0.16b, v28.16b //AES block 8k+8 - round 11
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ aese v5.16b, v28.16b //AES block 8k+13 - round 11
+
+ aese v2.16b, v28.16b //AES block 8k+10 - round 11
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+
+ aese v6.16b, v28.16b //AES block 8k+14 - round 11
+ aese v4.16b, v28.16b //AES block 8k+12 - round 11
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+ aese v3.16b, v28.16b //AES block 8k+11 - round 11
+ aese v1.16b, v28.16b //AES block 8k+9 - round 11
+ aese v7.16b, v28.16b //AES block 8k+15 - round 11
+
+.L192_dec_tail: //TAIL
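+ //Tail: between 1 and 8 blocks remain.  The keystream blocks computed above
+ //sit in v0-v7; for shorter tails they are shifted down so the keystream for
+ //the final (possibly partial) block always ends up in v7.  The accumulated
+ //GHASH state (v19) is fed into the first tail block only, via v16, which is
+ //then zeroed so later blocks do not re-apply it.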
+
+ sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
+
+ ldp q20, q21, [x3, #128] //load h5l | h5h and h6k | h5k
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
+
+ ldp q24, q25, [x3, #192] //load h8k | h7k and h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ mov v29.16b, v26.16b //v29 = final round key (rk12) for the tail eor3 combines
+
+ ldp q22, q23, [x3, #160] //load h6l | h6h and h7l | h7h
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
+
+.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
+ cmp x5, #112
+ b.gt .L192_dec_blocks_more_than_7
+
+ mov v7.16b, v6.16b
+ movi v17.8b, #0
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+ mov v4.16b, v3.16b
+
+ cmp x5, #96
+ movi v19.8b, #0
+ mov v3.16b, v2.16b
+
+ mov v2.16b, v1.16b
+ movi v18.8b, #0
+ b.gt .L192_dec_blocks_more_than_6
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+
+ mov v4.16b, v3.16b
+ mov v3.16b, v1.16b
+
+ sub v30.4s, v30.4s, v31.4s
+ cmp x5, #80
+ b.gt .L192_dec_blocks_more_than_5
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v4.16b
+ mov v4.16b, v1.16b
+ cmp x5, #64
+
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L192_dec_blocks_more_than_4
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v1.16b
+ cmp x5, #48
+ b.gt .L192_dec_blocks_more_than_3
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v6.16b
+ cmp x5, #32
+
+ mov v6.16b, v1.16b
+ ldr q24, [x3, #96] //load h4k | h3k
+ b.gt .L192_dec_blocks_more_than_2
+
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v7.16b, v1.16b
+ cmp x5, #16
+ b.gt .L192_dec_blocks_more_than_1
+
+ sub v30.4s, v30.4s, v31.4s
+ ldr q21, [x3, #48] //load h2k | h1k
+ b .L192_dec_blocks_less_than_1
+.L192_dec_blocks_more_than_7: //blocks left > 7
+ rev64 v8.16b, v9.16b //GHASH final-7 block
+
+ ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
+ ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
+ ldr q9, [x0], #16 //AES final-6 block - load ciphertext
+
+ pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
+ st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
+
+.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
+
+ pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+.L192_dec_blocks_more_than_6: //blocks left > 6
+
+ rev64 v8.16b, v9.16b //GHASH final-6 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ldr q9, [x0], #16 //AES final-5 block - load ciphertext
+ ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
+
+ st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
+.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
+.L192_dec_blocks_more_than_5: //blocks left > 5
+
+ rev64 v8.16b, v9.16b //GHASH final-5 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
+
+ ldr q9, [x0], #16 //AES final-4 block - load ciphertext
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
+ movi v16.8b, #0 //suppress further partial tag feed in
+ st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
+.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
+.L192_dec_blocks_more_than_4: //blocks left > 4
+
+ rev64 v8.16b, v9.16b //GHASH final-4 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ ldr q9, [x0], #16 //AES final-3 block - load ciphertext
+ ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
+ pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
+
+ pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
+ st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
+
+.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
+.L192_dec_blocks_more_than_3: //blocks left > 3
+
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v8.16b, v9.16b //GHASH final-3 block
+ ldr q9, [x0], #16 //AES final-2 block - load ciphertext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
+ pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
+ movi v16.8b, #0 //suppress further partial tag feed in
+ pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
+
+ st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
+ eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
+.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
+ ldr q24, [x3, #96] //load h4k | h3k
+
+ ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
+
+ pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
+.L192_dec_blocks_more_than_2: //blocks left > 2
+
+ rev64 v8.16b, v9.16b //GHASH final-2 block
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
+ ldr q9, [x0], #16 //AES final-1 block - load ciphertext
+
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
+ st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
+.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
+.L192_dec_blocks_more_than_1: //blocks left > 1
+
+ rev64 v8.16b, v9.16b //GHASH final-1 block
+ ldr q9, [x0], #16 //AES final block - load ciphertext
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ movi v16.8b, #0 //suppress further partial tag feed in
+ ldr q21, [x3, #48] //load h2k | h1k
+
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
+ ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
+ st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
+
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
+
+.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
+.L192_dec_blocks_less_than_1: //blocks left <= 1
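+ //The last block may be partial: derive the number of valid bits, build a
+ //byte mask for them in v0, clear the unused ciphertext bytes before they
+ //enter GHASH, and use bif to merge the decrypted bytes with the bytes
+ //already present at the end of the output buffer.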
+
+ rev32 v30.16b, v30.16b
+ and x1, x1, #127 //bit_length %= 128
+
+ sub x1, x1, #128 //bit_length -= 128
+ str q30, [x16] //store the updated counter
+
+ neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
+ mvn x6, xzr //temp0_x = 0xffffffffffffffff
+
+ and x1, x1, #127 //bit_length %= 128
+
+ mvn x7, xzr //temp1_x = 0xffffffffffffffff
+ lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
+ cmp x1, #64
+
+ csel x13, x7, x6, lt
+ csel x14, x6, xzr, lt
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+
+ mov v0.d[1], x14
+ ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
+
+ mov v0.d[0], x13 //ctr0b is mask for last block
+
+ and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
+ bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
+
+ rev64 v8.16b, v9.16b //GHASH final block
+
+ st1 { v12.16b}, [x2] //store all 16B
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v16.d[0], v8.d[1] //GHASH final block - mid
+ pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
+
+ eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
+ eor v19.16b, v19.16b, v26.16b //GHASH final block - low
+
+ pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final block - high
+
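+ //MODULO: reduce the 256-bit GHASH accumulator (high v17, mid v18, low v19)
+ //modulo the GCM polynomial.  The mid limb is completed Karatsuba-style, then
+ //two pmulls by the 0xc200000000000000 constant (set up in the prologue and
+ //loaded through x10) fold the high and mid limbs into the low 128 bits.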
+ eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+
+ pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
+
+.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
+ ext v19.16b, v19.16b, v19.16b, #8 //restore the tag's original byte order
+ rev64 v19.16b, v19.16b
+ st1 { v19.16b }, [x3] //store the updated GHASH tag (Xi)
+
+ mov x0, x9 //return the number of bytes processed
+
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ ldp d8, d9, [sp], #80
+ ret
+
+.L192_dec_ret:
+ mov w0, #0x0
+ ret
+.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
+.globl unroll8_eor3_aes_gcm_enc_256_kernel
+.type unroll8_eor3_aes_gcm_enc_256_kernel,%function
+.align 4
+unroll8_eor3_aes_gcm_enc_256_kernel:
+ AARCH64_VALID_CALL_TARGET
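+ //Register usage, as set up below: x0 input, x1 length in bits, x2 output,
+ //x3 current tag (Xi) followed by the precomputed powers of H, x4 counter
+ //block, x5 the 15 expanded AES-256 round keys (rk0-rk14); as in the other
+ //kernels in this file, the processed byte count is returned in x0.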
+ cbz x1, .L256_enc_ret
+ stp d8, d9, [sp, #-80]!
+ lsr x9, x1, #3
+ mov x16, x4
+ mov x8, x5
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ mov x5, #0xc200000000000000
+ stp x5, xzr, [sp, #64]
+ add x10, sp, #64
+
+ ld1 { v0.16b}, [x16] //CTR block 0
+
+ mov x5, x9
+
+ mov x15, #0x100000000 //set up counter increment
+ movi v31.16b, #0x0
+ mov v31.d[1], x15
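+ //The working counter v30 is kept byte-swapped (rev32) so that adding v31,
+ //which holds 1 in its top 32-bit lane, increments the big-endian 32-bit
+ //counter field; each CTR block is produced by a rev32 back to byte order.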
+ sub x5, x5, #1 //byte_len - 1
+
+ and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ add x5, x5, x0
+
+ rev32 v30.16b, v0.16b //set up reversed counter
+
+ add v30.4s, v30.4s, v31.4s //CTR block 0
+
+ rev32 v1.16b, v30.16b //CTR block 1
+ add v30.4s, v30.4s, v31.4s //CTR block 1
+
+ rev32 v2.16b, v30.16b //CTR block 2
+ add v30.4s, v30.4s, v31.4s //CTR block 2
+
+ rev32 v3.16b, v30.16b //CTR block 3
+ add v30.4s, v30.4s, v31.4s //CTR block 3
+
+ rev32 v4.16b, v30.16b //CTR block 4
+ add v30.4s, v30.4s, v31.4s //CTR block 4
+
+ rev32 v5.16b, v30.16b //CTR block 5
+ add v30.4s, v30.4s, v31.4s //CTR block 5
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ rev32 v6.16b, v30.16b //CTR block 6
+ add v30.4s, v30.4s, v31.4s //CTR block 6
+
+ rev32 v7.16b, v30.16b //CTR block 7
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 0
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 0
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 1
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 1
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 2
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 2
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 2
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 3
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 3
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 3
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 3
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 3
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 3
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 4
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 4
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 4
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 5
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 5
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 5
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 5
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 6
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 6
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 6
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 6
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 6
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 7
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 7
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 7
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 7
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 8
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 8
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 8
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 8
+
+ ld1 { v19.16b}, [x3] //load the current GHASH tag (Xi)
+ ext v19.16b, v19.16b, v19.16b, #8 //byte-reverse the tag for GHASH processing
+ rev64 v19.16b, v19.16b
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 9
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 9
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 9
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 9
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 9
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 9
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 9
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 10
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 10
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 9
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 10
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 10
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 10
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 10
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 11
+ ldp q26, q27, [x8, #192] //load rk12, rk13
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 11
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 11
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 11
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 11
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 11
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 11
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 11
+
+ add v30.4s, v30.4s, v31.4s //CTR block 7
+ ldr q28, [x8, #224] //load rk14
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 12
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 12
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 12
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 12
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 12
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 12
+
+ aese v2.16b, v27.16b //AES block 2 - round 13
+ aese v1.16b, v27.16b //AES block 1 - round 13
+ aese v4.16b, v27.16b //AES block 4 - round 13
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 12
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 12
+
+ aese v0.16b, v27.16b //AES block 0 - round 13
+ aese v5.16b, v27.16b //AES block 5 - round 13
+
+ aese v6.16b, v27.16b //AES block 6 - round 13
+ aese v7.16b, v27.16b //AES block 7 - round 13
+ aese v3.16b, v27.16b //AES block 3 - round 13
+
+ add x4, x0, x1, lsr #3 //end_input_ptr
+ cmp x0, x5 //check if we have <= 8 blocks
+ b.ge .L256_enc_tail //handle tail
+
+ ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
+
+ ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
+
+.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
+ rev32 v0.16b, v30.16b //CTR block 8
+ add v30.4s, v30.4s, v31.4s //CTR block 8
+
+.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
+.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
+
+ rev32 v1.16b, v30.16b //CTR block 9
+ add v30.4s, v30.4s, v31.4s //CTR block 9
+ ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
+
+ ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
+.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
+ cmp x0, x5 //check if we have <= 8 blocks
+
+ rev32 v2.16b, v30.16b //CTR block 10
+ add v30.4s, v30.4s, v31.4s //CTR block 10
+ stp q8, q9, [x2], #32 //AES block 0, 1 - store result
+
+ stp q10, q11, [x2], #32 //AES block 2, 3 - store result
+
+ rev32 v3.16b, v30.16b //CTR block 11
+ add v30.4s, v30.4s, v31.4s //CTR block 11
+
+.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
+
+.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
+.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
+.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
+
+ stp q12, q13, [x2], #32 //AES block 4, 5 - store result
+ rev32 v4.16b, v30.16b //CTR block 12
+
+ stp q14, q15, [x2], #32 //AES block 6, 7 - store result
+ add v30.4s, v30.4s, v31.4s //CTR block 12
+ b.ge .L256_enc_prepretail //do prepretail
+
+.L256_enc_main_loop: //main loop start
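+ //Main loop: each iteration interleaves the AES-256 rounds for counter
+ //blocks 8k+8..8k+15 (rk0-rk13, with the rk14 whitening folded into the
+ //eor3 that combines keystream and plaintext) with the GHASH accumulation
+ //of the previous eight ciphertext blocks still held in v8-v15.  The
+ //.inst 0xce... words are EOR3 (three-way XOR) instructions emitted as raw
+ //encodings; each carries its mnemonic in the adjacent comment.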
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+ rev64 v8.16b, v8.16b //GHASH block 8k
+
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
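+ //Karatsuba middle terms: the 64-bit halves of each pair of blocks are
+ //interleaved with trn1/trn2 and XORed, so a single pmull2/pmull pair
+ //against the packed hNk constants produces both blocks' mid products.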
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+ ldr d16, [x10] //MODULO - load modulo constant
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+
+ ldp q26, q27, [x8, #192] //load rk12, rk13
+ rev32 v20.16b, v30.16b //CTR block 8k+16
+
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+16
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
+
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
+ rev32 v22.16b, v30.16b //CTR block 8k+17
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+17
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
+ ldr q28, [x8, #224] //load rk14
+ aese v7.16b, v27.16b //AES block 8k+15 - round 13
+
+ ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
+ ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
+
+ ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
+ aese v2.16b, v27.16b //AES block 8k+10 - round 13
+ aese v4.16b, v27.16b //AES block 8k+12 - round 13
+
+ rev32 v23.16b, v30.16b //CTR block 8k+18
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+18
+ aese v5.16b, v27.16b //AES block 8k+13 - round 13
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
+ aese v3.16b, v27.16b //AES block 8k+11 - round 13
+ cmp x0, x5 //.LOOP CONTROL
+
+.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
+ rev32 v25.16b, v30.16b //CTR block 8k+19
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+19
+
+ aese v0.16b, v27.16b //AES block 8k+8 - round 13
+ aese v6.16b, v27.16b //AES block 8k+14 - round 13
+.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result
+
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ aese v1.16b, v27.16b //AES block 8k+9 - round 13
+
+.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result
+ rev32 v4.16b, v30.16b //CTR block 8k+20
+.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
+
+ mov v3.16b, v25.16b //CTR block 8k+19
+.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
+.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+20
+ stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
+ mov v2.16b, v23.16b //CTR block 8k+18
+
+.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result
+.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
+ stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
+
+.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result
+ mov v1.16b, v22.16b //CTR block 8k+17
+ stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
+
+ stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
+ mov v0.16b, v20.16b //CTR block 8k+16
+ b.lt .L256_enc_main_loop
+
+.L256_enc_prepretail: //PREPRETAIL
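+ //Prepretail: GHASH the last eight ciphertext blocks from the main loop and
+ //run the AES rounds for the next eight counter blocks, without loading any
+ //more plaintext; the keystream left in v0-v7 is consumed by the tail below.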
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
+
+ ldp q26, q27, [x8, #192] //load rk12, rk13
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
+ ldr q28, [x8, #224] //load rk14
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
+ aese v0.16b, v27.16b //AES block 8k+8 - round 13
+
+.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
+ aese v5.16b, v27.16b //AES block 8k+13 - round 13
+ aese v1.16b, v27.16b //AES block 8k+9 - round 13
+
+ aese v3.16b, v27.16b //AES block 8k+11 - round 13
+ aese v4.16b, v27.16b //AES block 8k+12 - round 13
+ aese v7.16b, v27.16b //AES block 8k+15 - round 13
+
+ aese v2.16b, v27.16b //AES block 8k+10 - round 13
+ aese v6.16b, v27.16b //AES block 8k+14 - round 13
+.L256_enc_tail: //TAIL
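+ //Tail: between 1 and 8 blocks (at least one byte) remain.  As in the
+ //decrypt tails above, the prepared keystream blocks are shifted down so the
+ //last block's keystream ends up in v7, the GHASH state is fed into the
+ //first tail block via v16, and v29 holds the final round key for the eor3
+ //combines.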
+
+ ldp q24, q25, [x3, #192] //load h8k | h7k and h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
+
+ ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
+
+ ldp q20, q21, [x3, #128] //load h5l | h5h and h6k | h5k
+ ext v20.16b, v20.16b, v20.16b, #8
+
+ ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
+ ldp q22, q23, [x3, #160] //load h6l | h6h and h7l | h7h
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ mov v29.16b, v28.16b //v29 = final round key (rk14) for the tail eor3 combines
+
+ cmp x5, #112
+.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
+ b.gt .L256_enc_blocks_more_than_7
+
+ movi v19.8b, #0
+ mov v7.16b, v6.16b
+ movi v17.8b, #0
+
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+ mov v4.16b, v3.16b
+
+ mov v3.16b, v2.16b
+ sub v30.4s, v30.4s, v31.4s
+ mov v2.16b, v1.16b
+
+ movi v18.8b, #0
+ cmp x5, #96
+ b.gt .L256_enc_blocks_more_than_6
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+ cmp x5, #80
+
+ mov v5.16b, v4.16b
+ mov v4.16b, v3.16b
+ mov v3.16b, v1.16b
+
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L256_enc_blocks_more_than_5
+
+ mov v7.16b, v6.16b
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v6.16b, v5.16b
+ mov v5.16b, v4.16b
+
+ cmp x5, #64
+ mov v4.16b, v1.16b
+ b.gt .L256_enc_blocks_more_than_4
+
+ cmp x5, #48
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v1.16b
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L256_enc_blocks_more_than_3
+
+ cmp x5, #32
+ mov v7.16b, v6.16b
+ ldr q24, [x3, #96] //load h4k | h3k
+
+ mov v6.16b, v1.16b
+ sub v30.4s, v30.4s, v31.4s
+ b.gt .L256_enc_blocks_more_than_2
+
+ mov v7.16b, v1.16b
+
+ sub v30.4s, v30.4s, v31.4s
+ cmp x5, #16
+ b.gt .L256_enc_blocks_more_than_1
+
+ sub v30.4s, v30.4s, v31.4s
+ ldr q21, [x3, #48] //load h2k | h1k
+ b .L256_enc_blocks_less_than_1
+.L256_enc_blocks_more_than_7: //blocks left > 7
+ st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-7 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ldr q9, [x0], #16 //AES final-6 block - load plaintext
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
+ ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
+ ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
+.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
+
+ pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
+ pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
+.L256_enc_blocks_more_than_6: //blocks left > 6
+
+ st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-6 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
+ ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
+
+ ldr q9, [x0], #16 //AES final-5 block - load plaintext
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
+.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
+.L256_enc_blocks_more_than_5: //blocks left > 5
+
+ st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-5 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
+
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
+ eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
+
+ ldr q9, [x0], #16 //AES final-4 block - load plaintext
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+ eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
+.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
+.L256_enc_blocks_more_than_4: //blocks left > 4
+
+ st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-4 block
+
+ ldr q9, [x0], #16 //AES final-3 block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
+
+.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
+ pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
+
+ pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
+.L256_enc_blocks_more_than_3: //blocks left > 3
+
+ st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
+
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v8.16b, v9.16b //GHASH final-3 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
+ pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
+ eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
+ ldr q24, [x3, #96] //load h4k | h3k
+
+ ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
+ ldr q9, [x0], #16 //AES final-2 block - load plaintext
+
+ pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
+ pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
+
+.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
+.L256_enc_blocks_more_than_2: //blocks left > 2
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+
+ st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
+
+ rev64 v8.16b, v9.16b //GHASH final-2 block
+ ldr q9, [x0], #16 //AES final-1 block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
+.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
+.L256_enc_blocks_more_than_1: //blocks left > 1
+
+ st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
+
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ rev64 v8.16b, v9.16b //GHASH final-1 block
+ ldr q9, [x0], #16 //AES final block - load plaintext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
+
+.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
+ eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
+
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
+ eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
+
+ ldr q21, [x3, #48] //load h2k | h1k
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
+ ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
+.L256_enc_blocks_less_than_1: //blocks left <= 1
+
+ and x1, x1, #127 //bit_length %= 128
+
+ sub x1, x1, #128 //bit_length -= 128
+
+ neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
+
+ mvn x6, xzr //temp0_x = 0xffffffffffffffff
+ and x1, x1, #127 //bit_length %= 128
+
+ lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
+ cmp x1, #64
+ mvn x7, xzr //temp1_x = 0xffffffffffffffff
+
+ csel x14, x6, xzr, lt
+ csel x13, x7, x6, lt
+
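+	//x13/x14 now hold an all-ones mask over the valid low/high 64-bit halves
+	//of the final block; v0 (built below) zeroes the padding bits of the last
+	//ciphertext block before it enters GHASH, and bif keeps the untouched
+	//bytes already at the destination when the partial block is stored.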
+ mov v0.d[0], x13 //ctr0b is mask for last block
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+
+ ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
+ mov v0.d[1], x14
+
+ and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
+
+ rev64 v8.16b, v9.16b //GHASH final block
+
+ rev32 v30.16b, v30.16b
+ bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
+ str q30, [x16] //store the updated counter
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ st1 { v9.16b}, [x2] //store all 16B
+
+ ins v16.d[0], v8.d[1] //GHASH final block - mid
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
+ pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final block - high
+ eor v19.16b, v19.16b, v26.16b //GHASH final block - low
+
+ eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
+
+ pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
+
+ eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+
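+	//MODULO reduction: v17/v18/v19 hold the high/mid/low 128-bit partial
+	//products of the Karatsuba GHASH multiply. The eor3 below turns v18 into
+	//the true middle term, then two pmulls by d16 (0xc200000000000000) fold
+	//high into mid and mid into low, reducing the 256-bit product modulo the
+	//GCM polynomial x^128 + x^7 + x^2 + x + 1; v19 becomes the new tag.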
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+
+.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+ st1 { v19.16b }, [x3]
+ mov x0, x9 //return sizes
+
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ ldp d8, d9, [sp], #80
+ ret
+
+.L256_enc_ret:
+ mov w0, #0x0
+ ret
+.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
+.globl unroll8_eor3_aes_gcm_dec_256_kernel
+.type unroll8_eor3_aes_gcm_dec_256_kernel,%function
+.align 4
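+//Register usage mirrors the encrypt kernel: x0 = ciphertext in, x1 = length
+//in bits, x2 = plaintext out, x3 = current tag at [x3] with the H^1..H^8 and
+//Karatsuba constants at [x3, #32..#208], x4 = counter block, x5 = expanded
+//AES-256 key schedule.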
+unroll8_eor3_aes_gcm_dec_256_kernel:
+ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L256_dec_ret
+ stp d8, d9, [sp, #-80]!
+ lsr x9, x1, #3
+ mov x16, x4
+ mov x8, x5
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ mov x5, #0xc200000000000000
+ stp x5, xzr, [sp, #64]
+ add x10, sp, #64
+
+ ld1 { v0.16b}, [x16] //CTR block 0
+
+ mov x15, #0x100000000 //set up counter increment
+ movi v31.16b, #0x0
+ mov v31.d[1], x15
+ mov x5, x9
+
+ sub x5, x5, #1 //byte_len - 1
+
+ rev32 v30.16b, v0.16b //set up reversed counter
+
+ add v30.4s, v30.4s, v31.4s //CTR block 0
+
+ rev32 v1.16b, v30.16b //CTR block 1
+ add v30.4s, v30.4s, v31.4s //CTR block 1
+
+ rev32 v2.16b, v30.16b //CTR block 2
+ add v30.4s, v30.4s, v31.4s //CTR block 2
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ rev32 v3.16b, v30.16b //CTR block 3
+ add v30.4s, v30.4s, v31.4s //CTR block 3
+
+ rev32 v4.16b, v30.16b //CTR block 4
+ add v30.4s, v30.4s, v31.4s //CTR block 4
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+
+ rev32 v5.16b, v30.16b //CTR block 5
+ add v30.4s, v30.4s, v31.4s //CTR block 5
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+
+ rev32 v6.16b, v30.16b //CTR block 6
+ add v30.4s, v30.4s, v31.4s //CTR block 6
+
+ rev32 v7.16b, v30.16b //CTR block 7
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 0
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 0
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 1
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 1
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 2
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 2
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 2
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 2
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 3
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 3
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 3
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 3
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 3
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 3
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 3
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 3
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 4
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 4
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 4
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 4
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 4
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 4
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 5
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 5
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 5
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 5
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 5
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 5
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 5
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 6
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 6
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 6
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 6
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 6
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 7
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 7
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 7
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 7
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 7
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 7
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 7
+
+ and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 8
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 8
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 8
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 8
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 8
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 8
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 8
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 9
+
+ ld1 { v19.16b}, [x3]
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+ add x4, x0, x1, lsr #3 //end_input_ptr
+ add x5, x5, x0
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 9
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 9
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 9
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 9
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 9
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 9
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 9
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 10
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 10
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 10
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 10
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 10
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 10
+ ldp q26, q27, [x8, #192] //load rk12, rk13
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 11
+ add v30.4s, v30.4s, v31.4s //CTR block 7
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 11
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 11
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 11
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 11
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 11
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 11
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 11
+ ldr q28, [x8, #224] //load rk14
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 12
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 12
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 12
+
+ cmp x0, x5 //check if we have <= 8 blocks
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 12
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 12
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 12
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 12
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 12
+
+ aese v5.16b, v27.16b //AES block 5 - round 13
+ aese v1.16b, v27.16b //AES block 1 - round 13
+ aese v2.16b, v27.16b //AES block 2 - round 13
+
+ aese v0.16b, v27.16b //AES block 0 - round 13
+ aese v4.16b, v27.16b //AES block 4 - round 13
+ aese v6.16b, v27.16b //AES block 6 - round 13
+
+ aese v3.16b, v27.16b //AES block 3 - round 13
+ aese v7.16b, v27.16b //AES block 7 - round 13
+ b.ge .L256_dec_tail //handle tail
+
+ ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
+
+ ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
+
+ ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
+
+ ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
+ cmp x0, x5 //check if we have <= 8 blocks
+
+.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
+.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
+ stp q0, q1, [x2], #32 //AES block 0, 1 - store result
+
+ rev32 v0.16b, v30.16b //CTR block 8
+ add v30.4s, v30.4s, v31.4s //CTR block 8
+.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
+
+.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
+
+.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
+ rev32 v1.16b, v30.16b //CTR block 9
+ add v30.4s, v30.4s, v31.4s //CTR block 9
+
+.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
+ stp q2, q3, [x2], #32 //AES block 2, 3 - store result
+
+ rev32 v2.16b, v30.16b //CTR block 10
+ add v30.4s, v30.4s, v31.4s //CTR block 10
+
+.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
+
+ rev32 v3.16b, v30.16b //CTR block 11
+ add v30.4s, v30.4s, v31.4s //CTR block 11
+ stp q4, q5, [x2], #32 //AES block 4, 5 - store result
+
+.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
+ stp q6, q7, [x2], #32 //AES block 6, 7 - store result
+
+ rev32 v4.16b, v30.16b //CTR block 12
+ add v30.4s, v30.4s, v31.4s //CTR block 12
+ b.ge .L256_dec_prepretail //do prepretail
+
+.L256_dec_main_loop: //main loop start
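+	//Each iteration interleaves the 14 AES-256 rounds for counter blocks
+	//v0..v7 with the GHASH of the previous eight ciphertext blocks (v8..v15,
+	//multiplied by H^8..H^1 and reduced mod the GCM polynomial); the next
+	//eight ciphertext blocks are then loaded, XORed with the keystream and
+	//final round key via eor3, and stored as plaintext.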
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+ rev64 v8.16b, v8.16b //GHASH block 8k
+
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+
+ ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+ rev32 v20.16b, v30.16b //CTR block 8k+16
+ ldr d16, [x10] //MODULO - load modulo constant
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+16
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
+ ldp q26, q27, [x8, #192] //load rk12, rk13
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
+
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+ rev32 v22.16b, v30.16b //CTR block 8k+17
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
+
+ ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+17
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
+
+ rev32 v23.16b, v30.16b //CTR block 8k+18
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+18
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
+
+ ldr q28, [x8, #224] //load rk14
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
+
+ ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
+ aese v1.16b, v27.16b //AES block 8k+9 - round 13
+ aese v2.16b, v27.16b //AES block 8k+10 - round 13
+
+ ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
+ aese v0.16b, v27.16b //AES block 8k+8 - round 13
+ aese v5.16b, v27.16b //AES block 8k+13 - round 13
+
+ rev32 v25.16b, v30.16b //CTR block 8k+19
+.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
+.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
+
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+ aese v7.16b, v27.16b //AES block 8k+15 - round 13
+
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+19
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ aese v4.16b, v27.16b //AES block 8k+12 - round 13
+
+.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result
+.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
+ aese v3.16b, v27.16b //AES block 8k+11 - round 13
+
+ stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
+ mov v0.16b, v20.16b //CTR block 8k+16
+.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result
+
+.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
+.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
+ stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
+
+ mov v3.16b, v25.16b //CTR block 8k+19
+ mov v2.16b, v23.16b //CTR block 8k+18
+ aese v6.16b, v27.16b //AES block 8k+14 - round 13
+
+ mov v1.16b, v22.16b //CTR block 8k+17
+ stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
+.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result
+
+.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result
+ rev32 v4.16b, v30.16b //CTR block 8k+20
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+20
+
+ cmp x0, x5 //.LOOP CONTROL
+ stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
+ b.lt .L256_dec_main_loop
+
+.L256_dec_prepretail: //PREPRETAIL
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+ rev32 v5.16b, v30.16b //CTR block 8k+13
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+13
+
+ rev64 v12.16b, v12.16b //GHASH block 8k+4
+ ldr q21, [x3, #144] //load h6k | h5k
+ ldr q24, [x3, #192] //load h8k | h7k
+
+ rev32 v6.16b, v30.16b //CTR block 8k+14
+ rev64 v8.16b, v8.16b //GHASH block 8k
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+14
+
+ ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
+ ldr q23, [x3, #176] //load h7l | h7h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #208] //load h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v9.16b, v9.16b //GHASH block 8k+1
+
+ rev32 v7.16b, v30.16b //CTR block 8k+15
+ rev64 v10.16b, v10.16b //GHASH block 8k+2
+ ldr q20, [x3, #128] //load h5l | h5h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #160] //load h6l | h6h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
+
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
+ eor v8.16b, v8.16b, v19.16b //PRE 1
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
+
+ pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
+ trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
+
+ rev64 v11.16b, v11.16b //GHASH block 8k+3
+ pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
+ rev64 v14.16b, v14.16b //GHASH block 8k+6
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
+
+ pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
+ trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
+
+ ldp q27, q28, [x8, #64] //load rk4, rk5
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
+ pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
+ eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
+ eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
+ pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
+
+.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
+ trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+ trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
+
+ pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
+ pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
+ eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
+
+ pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
+
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
+ eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
+
+ eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
+ pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
+
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
+ pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
+ ldp q26, q27, [x8, #96] //load rk6, rk7
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v15.16b, v15.16b //GHASH block 8k+7
+ rev64 v13.16b, v13.16b //GHASH block 8k+5
+
+.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
+
+ trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
+ ldr q21, [x3, #48] //load h2k | h1k
+ ldr q24, [x3, #96] //load h4k | h3k
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
+
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
+
+ pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
+ pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
+ pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
+
+ trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
+ pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
+ trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
+ pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
+
+ ldp q28, q26, [x8, #128] //load rk8, rk9
+ pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
+
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
+.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
+ trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
+
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
+
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
+ eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
+ eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
+ pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
+ pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
+ pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
+
+ pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
+ pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
+
+ ldp q27, q28, [x8, #160] //load rk10, rk11
+.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
+.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
+
+.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
+.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
+ ldr d16, [x10] //MODULO - load modulo constant
+
+.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
+
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
+
+.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
+ ldp q26, q27, [x8, #192] //load rk12, rk13
+
+ ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
+
+ pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
+
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
+ aese v4.16b, v28.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
+
+.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
+
+ aese v3.16b, v27.16b //AES block 8k+11 - round 13
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
+ ldr q28, [x8, #224] //load rk14
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
+
+ aese v4.16b, v27.16b //AES block 8k+12 - round 13
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
+
+ aese v6.16b, v27.16b //AES block 8k+14 - round 13
+ aese v2.16b, v27.16b //AES block 8k+10 - round 13
+ aese v1.16b, v27.16b //AES block 8k+9 - round 13
+
+ aese v5.16b, v27.16b //AES block 8k+13 - round 13
+.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
+ add v30.4s, v30.4s, v31.4s //CTR block 8k+15
+
+ aese v7.16b, v27.16b //AES block 8k+15 - round 13
+ aese v0.16b, v27.16b //AES block 8k+8 - round 13
+.L256_dec_tail: //TAIL
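+	//As in the encrypt tail, but GHASH runs over the ciphertext just loaded
+	//(v9), while the decrypted output (v12) is what gets stored; the final
+	//partial block is masked before GHASH and merged with the existing
+	//destination bytes via bif before the final store.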
+
+ ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
+	sub x5, x4, x0                          //number of bytes left to process in the tail
+ cmp x5, #112
+
+ ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
+
+	ldp q24, q25, [x3, #192]                //load h8k | h7k and h8l | h8h
+ ext v25.16b, v25.16b, v25.16b, #8
+ mov v29.16b, v28.16b
+
+	ldp q20, q21, [x3, #128]                //load h5l | h5h and h6k | h5k
+ ext v20.16b, v20.16b, v20.16b, #8
+
+.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
+	ldp q22, q23, [x3, #160]                //load h6l | h6h and h7l | h7h
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ b.gt .L256_dec_blocks_more_than_7
+
+ mov v7.16b, v6.16b
+ sub v30.4s, v30.4s, v31.4s
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v4.16b
+ mov v4.16b, v3.16b
+ movi v19.8b, #0
+
+ movi v17.8b, #0
+ movi v18.8b, #0
+ mov v3.16b, v2.16b
+
+ cmp x5, #96
+ mov v2.16b, v1.16b
+ b.gt .L256_dec_blocks_more_than_6
+
+ mov v7.16b, v6.16b
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v4.16b
+ cmp x5, #80
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v4.16b, v3.16b
+ mov v3.16b, v1.16b
+ b.gt .L256_dec_blocks_more_than_5
+
+ cmp x5, #64
+ mov v7.16b, v6.16b
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v6.16b, v5.16b
+
+ mov v5.16b, v4.16b
+ mov v4.16b, v1.16b
+ b.gt .L256_dec_blocks_more_than_4
+
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v6.16b
+ cmp x5, #48
+
+ mov v6.16b, v5.16b
+ mov v5.16b, v1.16b
+ b.gt .L256_dec_blocks_more_than_3
+
+ ldr q24, [x3, #96] //load h4k | h3k
+ sub v30.4s, v30.4s, v31.4s
+ mov v7.16b, v6.16b
+
+ cmp x5, #32
+ mov v6.16b, v1.16b
+ b.gt .L256_dec_blocks_more_than_2
+
+ sub v30.4s, v30.4s, v31.4s
+
+ mov v7.16b, v1.16b
+ cmp x5, #16
+ b.gt .L256_dec_blocks_more_than_1
+
+ sub v30.4s, v30.4s, v31.4s
+ ldr q21, [x3, #48] //load h2k | h1k
+ b .L256_dec_blocks_less_than_1
+.L256_dec_blocks_more_than_7: //blocks left > 7
+ rev64 v8.16b, v9.16b //GHASH final-7 block
+ ldr q9, [x0], #16 //AES final-6 block - load ciphertext
+ st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
+
+ ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
+.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
+
+ pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
+ pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
+.L256_dec_blocks_more_than_6: //blocks left > 6
+
+ rev64 v8.16b, v9.16b //GHASH final-6 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ ldr q9, [x0], #16 //AES final-5 block - load ciphertext
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
+ st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
+
+.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
+ eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
+ eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
+.L256_dec_blocks_more_than_5: //blocks left > 5
+
+ rev64 v8.16b, v9.16b //GHASH final-5 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
+ ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
+
+ ldr q9, [x0], #16 //AES final-4 block - load ciphertext
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
+ st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
+
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
+ ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
+.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
+ eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
+ movi v16.8b, #0 //suppress further partial tag feed in
+.L256_dec_blocks_more_than_4: //blocks left > 4
+
+ rev64 v8.16b, v9.16b //GHASH final-4 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
+ ldr q9, [x0], #16 //AES final-3 block - load ciphertext
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
+
+ pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
+ st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
+.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
+.L256_dec_blocks_more_than_3: //blocks left > 3
+
+ ldr q25, [x3, #112] //load h4l | h4h
+ ext v25.16b, v25.16b, v25.16b, #8
+ rev64 v8.16b, v9.16b //GHASH final-3 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+ ldr q9, [x0], #16 //AES final-2 block - load ciphertext
+ ldr q24, [x3, #96] //load h4k | h3k
+
+ ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
+ st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
+
+.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
+
+ ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
+ pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
+ pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+ pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
+
+ eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
+.L256_dec_blocks_more_than_2: //blocks left > 2
+
+ rev64 v8.16b, v9.16b //GHASH final-2 block
+
+ ldr q23, [x3, #80] //load h3l | h3h
+ ext v23.16b, v23.16b, v23.16b, #8
+ ldr q9, [x0], #16 //AES final-1 block - load ciphertext
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
+
+ pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
+ st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
+.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
+ eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
+ movi v16.8b, #0 //suppress further partial tag feed in
+
+ pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
+ pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
+ eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
+.L256_dec_blocks_more_than_1: //blocks left > 1
+
+ rev64 v8.16b, v9.16b //GHASH final-1 block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
+ ldr q22, [x3, #64] //load h2l | h2h
+ ext v22.16b, v22.16b, v22.16b, #8
+
+ eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
+ ldr q9, [x0], #16 //AES final block - load ciphertext
+ st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
+
+ ldr q21, [x3, #48] //load h2k | h1k
+ pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
+
+ ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
+
+ eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
+
+.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
+ pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
+
+ pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
+
+ movi v16.8b, #0 //suppress further partial tag feed in
+ eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
+
+ eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
+.L256_dec_blocks_less_than_1: //blocks left <= 1
+
+ ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
+ mvn x6, xzr //temp0_x = 0xffffffffffffffff
+ and x1, x1, #127 //bit_length %= 128
+
+ sub x1, x1, #128 //bit_length -= 128
+ rev32 v30.16b, v30.16b
+ str q30, [x16] //store the updated counter
+
+ neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
+
+ and x1, x1, #127 //bit_length %= 128
+
+ lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
+ cmp x1, #64
+ mvn x7, xzr //temp1_x = 0xffffffffffffffff
+
+ csel x14, x6, xzr, lt
+ csel x13, x7, x6, lt
+
+ mov v0.d[0], x13 //ctr0b is mask for last block
+ mov v0.d[1], x14
+
+ and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
+ ldr q20, [x3, #32] //load h1l | h1h
+ ext v20.16b, v20.16b, v20.16b, #8
+ bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
+
+ rev64 v8.16b, v9.16b //GHASH final block
+
+ eor v8.16b, v8.16b, v16.16b //feed in partial tag
+
+ ins v16.d[0], v8.d[1] //GHASH final block - mid
+ pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
+
+ eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
+
+ pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
+ eor v17.16b, v17.16b, v28.16b //GHASH final block - high
+
+ pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
+
+ eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
+ ldr d16, [x10] //MODULO - load modulo constant
+ eor v19.16b, v19.16b, v26.16b //GHASH final block - low
+
+ pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
+ eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
+
+ ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
+ st1 { v12.16b}, [x2] //store all 16B
+
+ eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
+
+ eor v21.16b, v17.16b, v21.16b //MODULO - fold into mid
+ eor v18.16b, v18.16b, v21.16b //MODULO - fold into mid
+
+ pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
+
+ ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
+ eor v19.16b, v19.16b, v17.16b //MODULO - fold into low
+
+ eor v19.16b, v19.16b, v18.16b //MODULO - fold into low
+ ext v19.16b, v19.16b, v19.16b, #8
+ rev64 v19.16b, v19.16b
+ st1 { v19.16b }, [x3]
+ mov x0, x9
+
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ ldp d8, d9, [sp], #80
+ ret
+
+.L256_dec_ret:
+ mov w0, #0x0
+ ret
+.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
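The tail of the decrypt kernel above (from .L256_dec_blocks_less_than_1) derives a 128-bit mask from the remaining bit count and uses it both to zero the unused end of the possibly partial final block and to merge the previously stored bytes back into the result. A minimal C sketch of that mask computation follows; it is illustrative only (the names are invented here, it is not part of the generated file) and assumes the same little-endian lane layout as the code above, with d[0] holding the low 64 bits.

#include <stdint.h>

/* One 128-bit mask as two 64-bit halves: lo maps to v0.d[0], hi to v0.d[1]. */
struct mask128 { uint64_t lo, hi; };

/* bits = number of message bits in the final block, 1..128 (128 = full block). */
static struct mask128 last_block_mask(unsigned bits)
{
    unsigned drop = (128u - bits) & 127u;          /* bits to mask off, 0..127      */
    uint64_t part = ~UINT64_C(0) >> (drop & 63u);  /* LSR by register uses mod 64   */
    struct mask128 m;
    if (drop < 64) {            /* low half kept whole, high half kept partially */
        m.lo = ~UINT64_C(0);
        m.hi = part;
    } else {                    /* high half dropped, low half kept partially    */
        m.lo = part;
        m.hi = 0;
    }
    return m;
}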
+.byte 65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0
+.align 2
+.align 2
+#endif
diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
index 55856548fa6f..d8082ccbe0a7 100644
--- a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
+++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
@@ -6390,6 +6390,7 @@ aes_gcm_dec_256_kernel:
mov w0, #0x0
ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
+.section .rodata
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
diff --git a/sys/crypto/openssl/aarch64/aesv8-armx.S b/sys/crypto/openssl/aarch64/aesv8-armx.S
index 015c2eea6dbb..d46d1f0a208c 100644
--- a/sys/crypto/openssl/aarch64/aesv8-armx.S
+++ b/sys/crypto/openssl/aarch64/aesv8-armx.S
@@ -4,12 +4,13 @@
#if __ARM_MAX_ARCH__>=7
.arch armv8-a+crypto
.text
+.section .rodata
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
-
+.previous
.globl aes_v8_set_encrypt_key
.type aes_v8_set_encrypt_key,%function
.align 5
@@ -32,7 +33,8 @@ aes_v8_set_encrypt_key:
tst w1,#0x3f
b.ne .Lenc_key_abort
- adr x3,.Lrcon
+ adrp x3,.Lrcon
+ add x3,x3,#:lo12:.Lrcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
@@ -1509,6 +1511,729 @@ aes_v8_cbc_encrypt:
ldr x29,[sp],#16
ret
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
+.globl aes_v8_ctr32_encrypt_blocks_unroll12_eor3
+.type aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function
+.align 5
+aes_v8_ctr32_encrypt_blocks_unroll12_eor3:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.

+ stp x29,x30,[sp,#-80]!
+ stp d8,d9,[sp, #16]
+ stp d10,d11,[sp, #32]
+ stp d12,d13,[sp, #48]
+ stp d14,d15,[sp, #64]
+ add x29,sp,#0
+
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+#ifdef __AARCH64EB__
+ ld1 {v24.16b},[x4]
+#else
+ ld1 {v24.4s},[x4]
+#endif
+ ld1 {v2.4s,v3.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last round key
+ sub w5,w5,#2
+ add x7, x7, #64
+ ld1 {v1.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
+#ifndef __AARCH64EB__
+ rev w8, w8
+#endif
+
+ orr v25.16b,v24.16b,v24.16b
+ add w10, w8, #1
+ orr v26.16b,v24.16b,v24.16b
+ add w8, w8, #2
+ orr v0.16b,v24.16b,v24.16b
+ rev w10, w10
+ mov v25.s[3],w10
+ b.ls .Lctr32_tail_unroll
+ cmp x2,#6
+ rev w12, w8
+ sub x2,x2,#3 // bias
+ mov v26.s[3],w12
+ b.lo .Loop3x_ctr32_unroll
+ cmp x2,#9
+ orr v27.16b,v24.16b,v24.16b
+ add w11, w8, #1
+ orr v28.16b,v24.16b,v24.16b
+ add w13, w8, #2
+ rev w11, w11
+ orr v29.16b,v24.16b,v24.16b
+ add w8, w8, #3
+ rev w13, w13
+ mov v27.s[3],w11
+ rev w14, w8
+ mov v28.s[3],w13
+ mov v29.s[3],w14
+ sub x2,x2,#3
+ b.lo .Loop6x_ctr32_unroll
+
+ // push regs to stack when 12 data chunks are interleaved
+ stp x19,x20,[sp,#-16]!
+ stp x21,x22,[sp,#-16]!
+ stp x23,x24,[sp,#-16]!
+ stp d8,d9,[sp,#-32]!
+ stp d10,d11,[sp,#-32]!
+
+ add w15,w8,#1
+ add w19,w8,#2
+ add w20,w8,#3
+ add w21,w8,#4
+ add w22,w8,#5
+ add w8,w8,#6
+ orr v30.16b,v24.16b,v24.16b
+ rev w15,w15
+ orr v31.16b,v24.16b,v24.16b
+ rev w19,w19
+ orr v8.16b,v24.16b,v24.16b
+ rev w20,w20
+ orr v9.16b,v24.16b,v24.16b
+ rev w21,w21
+ orr v10.16b,v24.16b,v24.16b
+ rev w22,w22
+ orr v11.16b,v24.16b,v24.16b
+ rev w23,w8
+
+ sub x2,x2,#6 // bias
+ mov v30.s[3],w15
+ mov v31.s[3],w19
+ mov v8.s[3],w20
+ mov v9.s[3],w21
+ mov v10.s[3],w22
+ mov v11.s[3],w23
+ b .Loop12x_ctr32_unroll
+
+.align 4
+.Loop12x_ctr32_unroll:
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ aese v30.16b,v2.16b
+ aesmc v30.16b,v30.16b
+ aese v31.16b,v2.16b
+ aesmc v31.16b,v31.16b
+ aese v8.16b,v2.16b
+ aesmc v8.16b,v8.16b
+ aese v9.16b,v2.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v2.16b
+ aesmc v10.16b,v10.16b
+ aese v11.16b,v2.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v2.4s},[x7],#16
+ subs w6,w6,#2
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v3.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v3.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v3.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v3.16b
+ aesmc v29.16b,v29.16b
+ aese v30.16b,v3.16b
+ aesmc v30.16b,v30.16b
+ aese v31.16b,v3.16b
+ aesmc v31.16b,v31.16b
+ aese v8.16b,v3.16b
+ aesmc v8.16b,v8.16b
+ aese v9.16b,v3.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v3.16b
+ aesmc v10.16b,v10.16b
+ aese v11.16b,v3.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v3.4s},[x7],#16
+ b.gt .Loop12x_ctr32_unroll
+
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ aese v30.16b,v2.16b
+ aesmc v30.16b,v30.16b
+ aese v31.16b,v2.16b
+ aesmc v31.16b,v31.16b
+ aese v8.16b,v2.16b
+ aesmc v8.16b,v8.16b
+ aese v9.16b,v2.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v2.16b
+ aesmc v10.16b,v10.16b
+ aese v11.16b,v2.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v2.4s},[x7],#16
+
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v3.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v3.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v3.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v3.16b
+ aesmc v29.16b,v29.16b
+ aese v30.16b,v3.16b
+ aesmc v30.16b,v30.16b
+ aese v31.16b,v3.16b
+ aesmc v31.16b,v31.16b
+ aese v8.16b,v3.16b
+ aesmc v8.16b,v8.16b
+ aese v9.16b,v3.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v3.16b
+ aesmc v10.16b,v10.16b
+ aese v11.16b,v3.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v3.4s},[x7],#16
+
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ add w9,w8,#1
+ add w10,w8,#2
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ add w12,w8,#3
+ add w11,w8,#4
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ add w13,w8,#5
+ add w14,w8,#6
+ rev w9,w9
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ add w15,w8,#7
+ add w19,w8,#8
+ rev w10,w10
+ rev w12,w12
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ add w20,w8,#9
+ add w21,w8,#10
+ rev w11,w11
+ rev w13,w13
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ add w22,w8,#11
+ add w23,w8,#12
+ rev w14,w14
+ rev w15,w15
+ aese v30.16b,v2.16b
+ aesmc v30.16b,v30.16b
+ rev w19,w19
+ rev w20,w20
+ aese v31.16b,v2.16b
+ aesmc v31.16b,v31.16b
+ rev w21,w21
+ rev w22,w22
+ aese v8.16b,v2.16b
+ aesmc v8.16b,v8.16b
+ rev w23,w23
+ aese v9.16b,v2.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v2.16b
+ aesmc v10.16b,v10.16b
+ aese v11.16b,v2.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v2.4s},[x7],#16
+
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v3.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v3.16b
+ aesmc v27.16b,v27.16b
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ aese v28.16b,v3.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v3.16b
+ aesmc v29.16b,v29.16b
+ aese v30.16b,v3.16b
+ aesmc v30.16b,v30.16b
+ aese v31.16b,v3.16b
+ aesmc v31.16b,v31.16b
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+ aese v8.16b,v3.16b
+ aesmc v8.16b,v8.16b
+ aese v9.16b,v3.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v3.16b
+ aesmc v10.16b,v10.16b
+ aese v11.16b,v3.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+ ld1 {v3.4s},[x7],#16
+
+ mov x7, x3
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ aese v30.16b,v2.16b
+ aesmc v30.16b,v30.16b
+ aese v31.16b,v2.16b
+ aesmc v31.16b,v31.16b
+ aese v8.16b,v2.16b
+ aesmc v8.16b,v8.16b
+ aese v9.16b,v2.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v2.16b
+ aesmc v10.16b,v10.16b
+ aese v11.16b,v2.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]
+
+ aese v24.16b,v3.16b
+.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
+ orr v24.16b,v0.16b,v0.16b
+ aese v25.16b,v3.16b
+.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
+ orr v25.16b,v0.16b,v0.16b
+ aese v26.16b,v3.16b
+.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b
+ orr v26.16b,v0.16b,v0.16b
+ aese v27.16b,v3.16b
+.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b
+ orr v27.16b,v0.16b,v0.16b
+ aese v28.16b,v3.16b
+.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b
+ orr v28.16b,v0.16b,v0.16b
+ aese v29.16b,v3.16b
+.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b
+ orr v29.16b,v0.16b,v0.16b
+ aese v30.16b,v3.16b
+.inst 0xce017a52 //eor3 v18.16b,v18.16b,v1.16b,v30.16b
+ orr v30.16b,v0.16b,v0.16b
+ aese v31.16b,v3.16b
+.inst 0xce017e73 //eor3 v19.16b,v19.16b,v1.16b,v31.16b
+ orr v31.16b,v0.16b,v0.16b
+ aese v8.16b,v3.16b
+.inst 0xce012294 //eor3 v20.16b,v20.16b,v1.16b,v8.16b
+ orr v8.16b,v0.16b,v0.16b
+ aese v9.16b,v3.16b
+.inst 0xce0126b5 //eor3 v21.16b,v21.16b,v1.16b,v9.16b
+ orr v9.16b,v0.16b,v0.16b
+ aese v10.16b,v3.16b
+.inst 0xce012ad6 //eor3 v22.16b,v22.16b,v1.16b,v10.16b
+ orr v10.16b,v0.16b,v0.16b
+ aese v11.16b,v3.16b
+.inst 0xce012ef7 //eor3 v23.16b,v23.16b,v1.16b,v11.16b
+ orr v11.16b,v0.16b,v0.16b
+ ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]
+
+ mov v24.s[3],w9
+ mov v25.s[3],w10
+ mov v26.s[3],w12
+ mov v27.s[3],w11
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ mov v28.s[3],w13
+ mov v29.s[3],w14
+ mov v30.s[3],w15
+ mov v31.s[3],w19
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
+ mov v8.s[3],w20
+ mov v9.s[3],w21
+ mov v10.s[3],w22
+ mov v11.s[3],w23
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ mov w6,w5
+
+ add w8,w8,#12
+ subs x2,x2,#12
+ b.hs .Loop12x_ctr32_unroll
+
+ // pop regs from stack when 12 data chunks are interleaved
+ ldp d10,d11,[sp],#32
+ ldp d8,d9,[sp],#32
+ ldp x23,x24,[sp],#16
+ ldp x21,x22,[sp],#16
+ ldp x19,x20,[sp],#16
+
+ add x2,x2,#12
+ cbz x2,.Lctr32_done_unroll
+ sub w8,w8,#12
+
+ cmp x2,#2
+ b.ls .Lctr32_tail_unroll
+
+ cmp x2,#6
+ sub x2,x2,#3 // bias
+ add w8,w8,#3
+ b.lo .Loop3x_ctr32_unroll
+
+ sub x2,x2,#3
+ add w8,w8,#3
+ b.lo .Loop6x_ctr32_unroll
+
+.align 4
+.Loop6x_ctr32_unroll:
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ ld1 {v2.4s},[x7],#16
+ subs w6,w6,#2
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v3.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v3.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v3.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v3.16b
+ aesmc v29.16b,v29.16b
+ ld1 {v3.4s},[x7],#16
+ b.gt .Loop6x_ctr32_unroll
+
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ ld1 {v2.4s},[x7],#16
+
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v3.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v3.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v3.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v3.16b
+ aesmc v29.16b,v29.16b
+ ld1 {v3.4s},[x7],#16
+
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ add w9,w8,#1
+ add w10,w8,#2
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ add w12,w8,#3
+ add w11,w8,#4
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ add w13,w8,#5
+ add w14,w8,#6
+ rev w9,w9
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ rev w10,w10
+ rev w12,w12
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ rev w11,w11
+ rev w13,w13
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ rev w14,w14
+ ld1 {v2.4s},[x7],#16
+
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ aese v26.16b,v3.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v3.16b
+ aesmc v27.16b,v27.16b
+ ld1 {v16.16b,v17.16b},[x0],#32
+ aese v28.16b,v3.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v3.16b
+ aesmc v29.16b,v29.16b
+ ld1 {v3.4s},[x7],#16
+
+ mov x7, x3
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ aese v27.16b,v2.16b
+ aesmc v27.16b,v27.16b
+ aese v28.16b,v2.16b
+ aesmc v28.16b,v28.16b
+ aese v29.16b,v2.16b
+ aesmc v29.16b,v29.16b
+ ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]
+
+ aese v24.16b,v3.16b
+.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
+ aese v25.16b,v3.16b
+.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
+ aese v26.16b,v3.16b
+.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b
+ aese v27.16b,v3.16b
+.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b
+ aese v28.16b,v3.16b
+.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b
+ aese v29.16b,v3.16b
+.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b
+ ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]
+
+ orr v24.16b,v0.16b,v0.16b
+ orr v25.16b,v0.16b,v0.16b
+ orr v26.16b,v0.16b,v0.16b
+ orr v27.16b,v0.16b,v0.16b
+ orr v28.16b,v0.16b,v0.16b
+ orr v29.16b,v0.16b,v0.16b
+
+ mov v24.s[3],w9
+ mov v25.s[3],w10
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ mov v26.s[3],w12
+ mov v27.s[3],w11
+ st1 {v16.16b,v17.16b},[x1],#32
+ mov v28.s[3],w13
+ mov v29.s[3],w14
+
+ cbz x2,.Lctr32_done_unroll
+ mov w6,w5
+
+ cmp x2,#2
+ b.ls .Lctr32_tail_unroll
+
+ sub x2,x2,#3 // bias
+ add w8,w8,#3
+ b .Loop3x_ctr32_unroll
+
+.align 4
+.Loop3x_ctr32_unroll:
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v2.4s},[x7],#16
+ subs w6,w6,#2
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ aese v26.16b,v3.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v3.4s},[x7],#16
+ b.gt .Loop3x_ctr32_unroll
+
+ aese v24.16b,v2.16b
+ aesmc v9.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v10.16b,v25.16b
+ ld1 {v4.16b,v5.16b,v6.16b},[x0],#48
+ orr v24.16b,v0.16b,v0.16b
+ aese v26.16b,v2.16b
+ aesmc v26.16b,v26.16b
+ ld1 {v2.4s},[x7],#16
+ orr v25.16b,v0.16b,v0.16b
+ aese v9.16b,v3.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v3.16b
+ aesmc v10.16b,v10.16b
+ aese v26.16b,v3.16b
+ aesmc v11.16b,v26.16b
+ ld1 {v3.4s},[x7],#16
+ orr v26.16b,v0.16b,v0.16b
+ add w9,w8,#1
+ aese v9.16b,v2.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v2.16b
+ aesmc v10.16b,v10.16b
+ add w10,w8,#2
+ aese v11.16b,v2.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v2.4s},[x7],#16
+ add w8,w8,#3
+ aese v9.16b,v3.16b
+ aesmc v9.16b,v9.16b
+ aese v10.16b,v3.16b
+ aesmc v10.16b,v10.16b
+
+ rev w9,w9
+ aese v11.16b,v3.16b
+ aesmc v11.16b,v11.16b
+ ld1 {v3.4s},[x7],#16
+ mov v24.s[3], w9
+ mov x7,x3
+ rev w10,w10
+ aese v9.16b,v2.16b
+ aesmc v9.16b,v9.16b
+
+ aese v10.16b,v2.16b
+ aesmc v10.16b,v10.16b
+ mov v25.s[3], w10
+ rev w12,w8
+ aese v11.16b,v2.16b
+ aesmc v11.16b,v11.16b
+ mov v26.s[3], w12
+
+ aese v9.16b,v3.16b
+ aese v10.16b,v3.16b
+ aese v11.16b,v3.16b
+
+.inst 0xce012484 //eor3 v4.16b,v4.16b,v1.16b,v9.16b
+ ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]
+.inst 0xce0128a5 //eor3 v5.16b,v5.16b,v1.16b,v10.16b
+ mov w6,w5
+.inst 0xce012cc6 //eor3 v6.16b,v6.16b,v1.16b,v11.16b
+ ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v4.16b,v5.16b,v6.16b},[x1],#48
+
+ cbz x2,.Lctr32_done_unroll
+
+.Lctr32_tail_unroll:
+ cmp x2,#1
+ b.eq .Lctr32_tail_1_unroll
+
+.Lctr32_tail_2_unroll:
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v2.4s},[x7],#16
+ subs w6,w6,#2
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v3.4s},[x7],#16
+ b.gt .Lctr32_tail_2_unroll
+
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v2.4s},[x7],#16
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v3.4s},[x7],#16
+ ld1 {v4.16b,v5.16b},[x0],#32
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v2.4s},[x7],#16
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v3.16b
+ aesmc v25.16b,v25.16b
+ ld1 {v3.4s},[x7],#16
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v25.16b,v2.16b
+ aesmc v25.16b,v25.16b
+ aese v24.16b,v3.16b
+ aese v25.16b,v3.16b
+
+.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
+.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
+ st1 {v4.16b,v5.16b},[x1],#32
+ b .Lctr32_done_unroll
+
+.Lctr32_tail_1_unroll:
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v2.4s},[x7],#16
+ subs w6,w6,#2
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v3.4s},[x7],#16
+ b.gt .Lctr32_tail_1_unroll
+
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v2.4s},[x7],#16
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v3.4s},[x7],#16
+ ld1 {v4.16b},[x0]
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v2.4s},[x7],#16
+ aese v24.16b,v3.16b
+ aesmc v24.16b,v24.16b
+ ld1 {v3.4s},[x7],#16
+ aese v24.16b,v2.16b
+ aesmc v24.16b,v24.16b
+ aese v24.16b,v3.16b
+
+.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
+ st1 {v4.16b},[x1],#16
+
+.Lctr32_done_unroll:
+ ldp d8,d9,[sp, #16]
+ ldp d10,d11,[sp, #32]
+ ldp d12,d13,[sp, #48]
+ ldp d14,d15,[sp, #64]
+ ldr x29,[sp],#80
+ ret
+.size aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.globl aes_v8_ctr32_encrypt_blocks
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
@@ -3116,7 +3841,7 @@ aes_v8_xts_decrypt:
cbnz x2,.Lxts_dec_1st_done
ld1 {v0.16b},[x0],#16
- // Decrypt the last secod block to get the last plain text block
+ // Decrypt the last second block to get the last plain text block
.Lxts_dec_1st_done:
eor v26.16b,v0.16b,v8.16b
ldr w6,[x3,#240]
diff --git a/sys/crypto/openssl/aarch64/arm64cpuid.S b/sys/crypto/openssl/aarch64/arm64cpuid.S
index 52c6ee5b65d3..81530bda1c67 100644
--- a/sys/crypto/openssl/aarch64/arm64cpuid.S
+++ b/sys/crypto/openssl/aarch64/arm64cpuid.S
@@ -57,14 +57,46 @@ _armv8_pmull_probe:
ret
.size _armv8_pmull_probe,.-_armv8_pmull_probe
+.globl _armv8_sm4_probe
+.type _armv8_sm4_probe,%function
+_armv8_sm4_probe:
+ AARCH64_VALID_CALL_TARGET
+.inst 0xcec08400 // sm4e v0.4s, v0.4s
+ ret
+.size _armv8_sm4_probe,.-_armv8_sm4_probe
+
.globl _armv8_sha512_probe
.type _armv8_sha512_probe,%function
_armv8_sha512_probe:
AARCH64_VALID_CALL_TARGET
-.long 0xcec08000 // sha512su0 v0.2d,v0.2d
+.inst 0xcec08000 // sha512su0 v0.2d,v0.2d
ret
.size _armv8_sha512_probe,.-_armv8_sha512_probe
+.globl _armv8_eor3_probe
+.type _armv8_eor3_probe,%function
+_armv8_eor3_probe:
+ AARCH64_VALID_CALL_TARGET
+.inst 0xce010800 // eor3 v0.16b, v0.16b, v1.16b, v2.16b
+ ret
+.size _armv8_eor3_probe,.-_armv8_eor3_probe
+
+.globl _armv8_sve_probe
+.type _armv8_sve_probe,%function
+_armv8_sve_probe:
+ AARCH64_VALID_CALL_TARGET
+.inst 0x04a03000 // eor z0.d,z0.d,z0.d
+ ret
+.size _armv8_sve_probe,.-_armv8_sve_probe
+
+.globl _armv8_sve2_probe
+.type _armv8_sve2_probe,%function
+_armv8_sve2_probe:
+ AARCH64_VALID_CALL_TARGET
+.inst 0x04e03400 // xar z0.d,z0.d,z0.d
+ ret
+.size _armv8_sve2_probe,.-_armv8_sve2_probe
+
.globl _armv8_cpuid_probe
.type _armv8_cpuid_probe,%function
_armv8_cpuid_probe:
@@ -73,6 +105,14 @@ _armv8_cpuid_probe:
ret
.size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+.globl _armv8_sm3_probe
+.type _armv8_sm3_probe,%function
+_armv8_sm3_probe:
+ AARCH64_VALID_CALL_TARGET
+.inst 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
+ ret
+.size _armv8_sm3_probe,.-_armv8_sm3_probe
+
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,%function
.align 5
@@ -138,3 +178,98 @@ CRYPTO_memcmp:
lsr w0,w0,#31
ret
.size CRYPTO_memcmp,.-CRYPTO_memcmp
+
+.globl _armv8_rng_probe
+.type _armv8_rng_probe,%function
+_armv8_rng_probe:
+ AARCH64_VALID_CALL_TARGET
+ mrs x0, s3_3_c2_c4_0 // rndr
+ mrs x0, s3_3_c2_c4_1 // rndrrs
+ ret
+.size _armv8_rng_probe,.-_armv8_rng_probe
+// Fill buffer with Randomly Generated Bytes
+// inputs: char * in x0 - Pointer to buffer
+// size_t in x1 - Number of bytes to write to buffer
+// outputs: size_t in x0 - Number of bytes successfully written to buffer
+.globl OPENSSL_rndr_asm
+.type OPENSSL_rndr_asm,%function
+.align 4
+OPENSSL_rndr_asm:
+ AARCH64_VALID_CALL_TARGET
+ mov x2,xzr
+ mov x3,xzr
+
+.align 4
+.Loop_rndr:
+ cmp x1,#0
+ b.eq .rndr_done
+ mov x3,xzr
+ mrs x3,s3_3_c2_c4_0
+ b.eq .rndr_done
+
+ cmp x1,#8
+ b.lt .Loop_single_byte_rndr
+
+ str x3,[x0]
+ add x0,x0,#8
+ add x2,x2,#8
+ subs x1,x1,#8
+ b.ge .Loop_rndr
+
+.align 4
+.Loop_single_byte_rndr:
+ strb w3,[x0]
+ lsr x3,x3,#8
+ add x2,x2,#1
+ add x0,x0,#1
+ subs x1,x1,#1
+ b.gt .Loop_single_byte_rndr
+
+.align 4
+.rndr_done:
+ mov x0,x2
+ ret
+.size OPENSSL_rndr_asm,.-OPENSSL_rndr_asm
+// Fill buffer with Randomly Generated Bytes
+// inputs: char * in x0 - Pointer to buffer
+// size_t in x1 - Number of bytes to write to buffer
+// outputs: size_t in x0 - Number of bytes successfully written to buffer
+.globl OPENSSL_rndrrs_asm
+.type OPENSSL_rndrrs_asm,%function
+.align 4
+OPENSSL_rndrrs_asm:
+ AARCH64_VALID_CALL_TARGET
+ mov x2,xzr
+ mov x3,xzr
+
+.align 4
+.Loop_rndrrs:
+ cmp x1,#0
+ b.eq .rndrrs_done
+ mov x3,xzr
+ mrs x3,s3_3_c2_c4_1
+ b.eq .rndrrs_done
+
+ cmp x1,#8
+ b.lt .Loop_single_byte_rndrrs
+
+ str x3,[x0]
+ add x0,x0,#8
+ add x2,x2,#8
+ subs x1,x1,#8
+ b.ge .Loop_rndrrs
+
+.align 4
+.Loop_single_byte_rndrrs:
+ strb w3,[x0]
+ lsr x3,x3,#8
+ add x2,x2,#1
+ add x0,x0,#1
+ subs x1,x1,#1
+ b.gt .Loop_single_byte_rndrrs
+
+.align 4
+.rndrrs_done:
+ mov x0,x2
+ ret
+.size OPENSSL_rndrrs_asm,.-OPENSSL_rndrrs_asm
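Both routines above share the interface spelled out in their header comments: the buffer pointer arrives in x0, the requested byte count in x1, and the number of bytes actually written comes back in x0, so a read that fails part-way returns a short count. The C sketch below shows how a caller might drive them; the prototypes are inferred from those comments (an assumption for illustration, not the authoritative declarations), and the caller is expected to have already confirmed RNDR support, for instance via the _armv8_rng_probe routine above.

#include <stddef.h>

/* Prototypes inferred from the register comments above (illustrative only). */
size_t OPENSSL_rndr_asm(unsigned char *buf, size_t len);
size_t OPENSSL_rndrrs_asm(unsigned char *buf, size_t len);

/* Fill buf with len bytes from the reseeding variant, continuing after short
 * reads and giving up if the instruction stops producing data. */
static int fill_random(unsigned char *buf, size_t len)
{
    while (len > 0) {
        size_t got = OPENSSL_rndrrs_asm(buf, len);
        if (got == 0)
            return 0;           /* hardware RNG unavailable or failing */
        buf += got;
        len -= got;
    }
    return 1;
}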
diff --git a/sys/crypto/openssl/aarch64/armv8-mont.S b/sys/crypto/openssl/aarch64/armv8-mont.S
index b429f39ee326..a12dcf3dcfc0 100644
--- a/sys/crypto/openssl/aarch64/armv8-mont.S
+++ b/sys/crypto/openssl/aarch64/armv8-mont.S
@@ -2131,6 +2131,7 @@ __bn_mul4x_mont:
AARCH64_VALIDATE_LINK_REGISTER
ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
+.section .rodata
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
diff --git a/sys/crypto/openssl/aarch64/bsaes-armv8.S b/sys/crypto/openssl/aarch64/bsaes-armv8.S
new file mode 100644
index 000000000000..cd43f2db7e21
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/bsaes-armv8.S
@@ -0,0 +1,2356 @@
+/* Do not modify. This file is auto-generated from bsaes-armv8.pl. */
+// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// ====================================================================
+// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
+// project. Rights for redistribution and usage in source and binary
+// forms are granted according to the OpenSSL license.
+// ====================================================================
+//
+// This implementation is a translation of bsaes-armv7 for AArch64.
+// No attempt has been made to carry across the build switches for
+// kernel targets, since the Linux kernel crypto support has moved on
+// from when it was based on OpenSSL.
+
+// A lot of hand-scheduling has been performed. Consequently, this code
+// doesn't factor out neatly into macros in the same way that the
+// AArch32 version did, and there is little to be gained by wrapping it
+// up in Perl, so it is presented as pure assembly.
+
+
+#include "crypto/arm_arch.h"
+
+.text
+
+
+
+
+
+.type _bsaes_decrypt8,%function
+.align 4
+// On entry:
+// x9 -> key (previously expanded using _bsaes_key_convert)
+// x10 = number of rounds
+// v0-v7 input data
+// On exit:
+// x9-x11 corrupted
+// other general-purpose registers preserved
+// v0-v7 output data
+// v11-v15 preserved
+// other SIMD registers corrupted
+_bsaes_decrypt8:
+ ldr q8, [x9], #16
+ adrp x11, .LM0ISR
+ add x11, x11, #:lo12:.LM0ISR
+ movi v9.16b, #0x55
+ ldr q10, [x11], #16
+ movi v16.16b, #0x33
+ movi v17.16b, #0x0f
+ sub x10, x10, #1
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v8.16b
+ eor v2.16b, v2.16b, v8.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v3.16b, v3.16b, v8.16b
+ eor v5.16b, v5.16b, v8.16b
+ tbl v0.16b, {v0.16b}, v10.16b
+ tbl v1.16b, {v1.16b}, v10.16b
+ tbl v2.16b, {v2.16b}, v10.16b
+ tbl v4.16b, {v4.16b}, v10.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v7.16b, v7.16b, v8.16b
+ tbl v3.16b, {v3.16b}, v10.16b
+ tbl v5.16b, {v5.16b}, v10.16b
+ tbl v6.16b, {v6.16b}, v10.16b
+ ushr v8.2d, v0.2d, #1
+ tbl v7.16b, {v7.16b}, v10.16b
+ ushr v10.2d, v4.2d, #1
+ ushr v18.2d, v2.2d, #1
+ eor v8.16b, v8.16b, v1.16b
+ ushr v19.2d, v6.2d, #1
+ eor v10.16b, v10.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v19.16b, v19.16b, v7.16b
+ and v10.16b, v10.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ eor v1.16b, v1.16b, v8.16b
+ shl v8.2d, v8.2d, #1
+ and v9.16b, v19.16b, v9.16b
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #1
+ eor v3.16b, v3.16b, v18.16b
+ shl v18.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v8.16b
+ shl v8.2d, v9.2d, #1
+ eor v7.16b, v7.16b, v9.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v2.16b, v2.16b, v18.16b
+ ushr v9.2d, v1.2d, #2
+ eor v6.16b, v6.16b, v8.16b
+ ushr v8.2d, v0.2d, #2
+ ushr v10.2d, v5.2d, #2
+ ushr v18.2d, v4.2d, #2
+ eor v9.16b, v9.16b, v3.16b
+ eor v8.16b, v8.16b, v2.16b
+ eor v10.16b, v10.16b, v7.16b
+ eor v18.16b, v18.16b, v6.16b
+ and v9.16b, v9.16b, v16.16b
+ and v8.16b, v8.16b, v16.16b
+ and v10.16b, v10.16b, v16.16b
+ and v16.16b, v18.16b, v16.16b
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v2.16b, v2.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v7.16b, v7.16b, v10.16b
+ shl v10.2d, v10.2d, #2
+ eor v6.16b, v6.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v1.16b, v1.16b, v9.16b
+ eor v0.16b, v0.16b, v8.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v3.2d, #4
+ ushr v9.2d, v2.2d, #4
+ ushr v10.2d, v1.2d, #4
+ ushr v16.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v7.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v10.16b, v10.16b, v5.16b
+ eor v16.16b, v16.16b, v4.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v10.16b, v10.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ eor v7.16b, v7.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #4
+ eor v4.16b, v4.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v16.16b
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
+ ldp q8, q9, [x9], #32
+ eor v0.16b, v16.16b, v0.16b
+ ldr q10, [x9], #16
+ eor v1.16b, v17.16b, v1.16b
+ ldr q16, [x9], #16
+ eor v2.16b, v18.16b, v2.16b
+ eor v3.16b, v19.16b, v3.16b
+ eor v4.16b, v8.16b, v4.16b
+ eor v5.16b, v9.16b, v5.16b
+ eor v6.16b, v10.16b, v6.16b
+ eor v7.16b, v16.16b, v7.16b
+ tbl v0.16b, {v0.16b}, v28.16b
+ tbl v1.16b, {v1.16b}, v28.16b
+ tbl v2.16b, {v2.16b}, v28.16b
+ tbl v3.16b, {v3.16b}, v28.16b
+ tbl v4.16b, {v4.16b}, v28.16b
+ tbl v5.16b, {v5.16b}, v28.16b
+ tbl v6.16b, {v6.16b}, v28.16b
+ tbl v7.16b, {v7.16b}, v28.16b
+.Ldec_sbox:
+ eor v1.16b, v1.16b, v4.16b
+ eor v3.16b, v3.16b, v4.16b
+ subs x10, x10, #1
+ eor v4.16b, v4.16b, v7.16b
+ eor v2.16b, v2.16b, v7.16b
+ eor v1.16b, v1.16b, v6.16b
+ eor v6.16b, v6.16b, v4.16b
+ eor v2.16b, v2.16b, v5.16b
+ eor v0.16b, v0.16b, v1.16b
+ eor v7.16b, v7.16b, v6.16b
+ eor v8.16b, v6.16b, v2.16b
+ and v9.16b, v4.16b, v6.16b
+ eor v10.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v0.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v16.16b, v7.16b, v4.16b
+ eor v17.16b, v4.16b, v0.16b
+ and v18.16b, v0.16b, v2.16b
+ eor v19.16b, v7.16b, v4.16b
+ eor v1.16b, v1.16b, v3.16b
+ eor v20.16b, v3.16b, v0.16b
+ eor v21.16b, v5.16b, v2.16b
+ eor v22.16b, v3.16b, v7.16b
+ and v8.16b, v17.16b, v8.16b
+ orr v17.16b, v3.16b, v5.16b
+ eor v23.16b, v1.16b, v6.16b
+ eor v24.16b, v20.16b, v16.16b
+ eor v25.16b, v1.16b, v5.16b
+ orr v26.16b, v20.16b, v21.16b
+ and v20.16b, v20.16b, v21.16b
+ and v27.16b, v7.16b, v1.16b
+ eor v21.16b, v21.16b, v23.16b
+ orr v28.16b, v16.16b, v23.16b
+ orr v29.16b, v22.16b, v25.16b
+ eor v26.16b, v26.16b, v8.16b
+ and v16.16b, v16.16b, v23.16b
+ and v22.16b, v22.16b, v25.16b
+ and v21.16b, v24.16b, v21.16b
+ eor v8.16b, v28.16b, v8.16b
+ eor v23.16b, v5.16b, v2.16b
+ eor v24.16b, v1.16b, v6.16b
+ eor v16.16b, v16.16b, v22.16b
+ eor v22.16b, v3.16b, v0.16b
+ eor v25.16b, v29.16b, v21.16b
+ eor v21.16b, v26.16b, v21.16b
+ eor v8.16b, v8.16b, v20.16b
+ eor v26.16b, v23.16b, v24.16b
+ eor v16.16b, v16.16b, v20.16b
+ eor v28.16b, v22.16b, v19.16b
+ eor v20.16b, v25.16b, v20.16b
+ eor v9.16b, v21.16b, v9.16b
+ eor v8.16b, v8.16b, v18.16b
+ eor v18.16b, v5.16b, v1.16b
+ eor v21.16b, v16.16b, v17.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v17.16b, v20.16b, v27.16b
+ eor v20.16b, v3.16b, v7.16b
+ eor v25.16b, v9.16b, v8.16b
+ eor v27.16b, v0.16b, v4.16b
+ and v29.16b, v9.16b, v17.16b
+ eor v30.16b, v8.16b, v29.16b
+ eor v31.16b, v21.16b, v29.16b
+ eor v29.16b, v21.16b, v29.16b
+ bsl v30.16b, v17.16b, v21.16b
+ bsl v31.16b, v9.16b, v8.16b
+ bsl v16.16b, v30.16b, v29.16b
+ bsl v21.16b, v29.16b, v30.16b
+ eor v8.16b, v31.16b, v30.16b
+ and v1.16b, v1.16b, v31.16b
+ and v9.16b, v16.16b, v31.16b
+ and v6.16b, v6.16b, v30.16b
+ eor v16.16b, v17.16b, v21.16b
+ and v4.16b, v4.16b, v30.16b
+ eor v17.16b, v8.16b, v30.16b
+ and v21.16b, v24.16b, v8.16b
+ eor v9.16b, v9.16b, v25.16b
+ and v19.16b, v19.16b, v8.16b
+ eor v24.16b, v30.16b, v16.16b
+ eor v25.16b, v30.16b, v16.16b
+ and v7.16b, v7.16b, v17.16b
+ and v10.16b, v10.16b, v16.16b
+ eor v29.16b, v9.16b, v16.16b
+ eor v30.16b, v31.16b, v9.16b
+ and v0.16b, v24.16b, v0.16b
+ and v9.16b, v18.16b, v9.16b
+ and v2.16b, v25.16b, v2.16b
+ eor v10.16b, v10.16b, v6.16b
+ eor v18.16b, v29.16b, v16.16b
+ and v5.16b, v30.16b, v5.16b
+ eor v24.16b, v8.16b, v29.16b
+ and v25.16b, v26.16b, v29.16b
+ and v26.16b, v28.16b, v29.16b
+ eor v8.16b, v8.16b, v29.16b
+ eor v17.16b, v17.16b, v18.16b
+ eor v5.16b, v1.16b, v5.16b
+ and v23.16b, v24.16b, v23.16b
+ eor v21.16b, v21.16b, v25.16b
+ eor v19.16b, v19.16b, v26.16b
+ eor v0.16b, v4.16b, v0.16b
+ and v3.16b, v17.16b, v3.16b
+ eor v1.16b, v9.16b, v1.16b
+ eor v9.16b, v25.16b, v23.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v2.16b, v6.16b, v2.16b
+ and v6.16b, v8.16b, v22.16b
+ eor v3.16b, v7.16b, v3.16b
+ and v8.16b, v20.16b, v18.16b
+ eor v10.16b, v10.16b, v9.16b
+ eor v0.16b, v0.16b, v19.16b
+ eor v9.16b, v1.16b, v9.16b
+ eor v1.16b, v2.16b, v21.16b
+ eor v3.16b, v3.16b, v19.16b
+ and v16.16b, v27.16b, v16.16b
+ eor v17.16b, v26.16b, v6.16b
+ eor v6.16b, v8.16b, v7.16b
+ eor v7.16b, v1.16b, v9.16b
+ eor v1.16b, v5.16b, v3.16b
+ eor v2.16b, v10.16b, v3.16b
+ eor v4.16b, v16.16b, v4.16b
+ eor v8.16b, v6.16b, v17.16b
+ eor v5.16b, v9.16b, v3.16b
+ eor v9.16b, v0.16b, v1.16b
+ eor v6.16b, v7.16b, v1.16b
+ eor v0.16b, v4.16b, v17.16b
+ eor v4.16b, v8.16b, v7.16b
+ eor v7.16b, v9.16b, v2.16b
+ eor v8.16b, v3.16b, v0.16b
+ eor v7.16b, v7.16b, v5.16b
+ eor v3.16b, v4.16b, v7.16b
+ eor v4.16b, v7.16b, v0.16b
+ eor v7.16b, v8.16b, v3.16b
+ bcc .Ldec_done
+ ext v8.16b, v0.16b, v0.16b, #8
+ ext v9.16b, v1.16b, v1.16b, #8
+ ldr q28, [x11] // load from .LISR in common case (x10 > 0)
+ ext v10.16b, v6.16b, v6.16b, #8
+ ext v16.16b, v3.16b, v3.16b, #8
+ ext v17.16b, v5.16b, v5.16b, #8
+ ext v18.16b, v4.16b, v4.16b, #8
+ eor v8.16b, v8.16b, v0.16b
+ eor v9.16b, v9.16b, v1.16b
+ eor v10.16b, v10.16b, v6.16b
+ eor v16.16b, v16.16b, v3.16b
+ eor v17.16b, v17.16b, v5.16b
+ ext v19.16b, v2.16b, v2.16b, #8
+ ext v20.16b, v7.16b, v7.16b, #8
+ eor v18.16b, v18.16b, v4.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v8.16b, v2.16b, v10.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v19.16b, v2.16b
+ eor v9.16b, v20.16b, v7.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v8.16b, v8.16b, v16.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v2.16b, v3.16b, v2.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v3.16b, v5.16b, v9.16b
+ eor v5.16b, v8.16b, v17.16b
+ eor v7.16b, v7.16b, v17.16b
+ ext v8.16b, v0.16b, v0.16b, #12
+ ext v9.16b, v6.16b, v6.16b, #12
+ ext v10.16b, v4.16b, v4.16b, #12
+ ext v16.16b, v1.16b, v1.16b, #12
+ ext v17.16b, v5.16b, v5.16b, #12
+ ext v18.16b, v7.16b, v7.16b, #12
+ eor v0.16b, v0.16b, v8.16b
+ eor v6.16b, v6.16b, v9.16b
+ eor v4.16b, v4.16b, v10.16b
+ ext v19.16b, v2.16b, v2.16b, #12
+ ext v20.16b, v3.16b, v3.16b, #12
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v2.16b, v2.16b, v19.16b
+ eor v16.16b, v16.16b, v0.16b
+ eor v3.16b, v3.16b, v20.16b
+ eor v17.16b, v17.16b, v4.16b
+ eor v10.16b, v10.16b, v6.16b
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v9.16b, v9.16b, v1.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v8.16b, v8.16b, v3.16b
+ eor v16.16b, v16.16b, v3.16b
+ eor v18.16b, v18.16b, v5.16b
+ eor v19.16b, v19.16b, v7.16b
+ ext v21.16b, v5.16b, v5.16b, #8
+ ext v5.16b, v7.16b, v7.16b, #8
+ eor v7.16b, v20.16b, v2.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ ext v20.16b, v3.16b, v3.16b, #8
+ eor v17.16b, v17.16b, v3.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ eor v3.16b, v10.16b, v3.16b
+ ext v10.16b, v6.16b, v6.16b, #8
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v5.16b, v18.16b
+ eor v3.16b, v3.16b, v4.16b
+ eor v7.16b, v20.16b, v7.16b
+ eor v6.16b, v2.16b, v19.16b
+ eor v4.16b, v21.16b, v17.16b
+ eor v2.16b, v10.16b, v9.16b
+ bne .Ldec_loop
+ ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+ ushr v8.2d, v0.2d, #1
+ movi v9.16b, #0x55
+ ldr q10, [x9]
+ ushr v16.2d, v2.2d, #1
+ movi v17.16b, #0x33
+ ushr v18.2d, v6.2d, #1
+ movi v19.16b, #0x0f
+ eor v8.16b, v8.16b, v1.16b
+ ushr v20.2d, v3.2d, #1
+ eor v16.16b, v16.16b, v7.16b
+ eor v18.16b, v18.16b, v4.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v20.16b, v20.16b, v5.16b
+ and v16.16b, v16.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ shl v21.2d, v8.2d, #1
+ eor v1.16b, v1.16b, v8.16b
+ and v8.16b, v20.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ shl v9.2d, v16.2d, #1
+ eor v4.16b, v4.16b, v18.16b
+ shl v16.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v21.16b
+ shl v18.2d, v8.2d, #1
+ eor v5.16b, v5.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v6.16b, v6.16b, v16.16b
+ ushr v8.2d, v1.2d, #2
+ eor v3.16b, v3.16b, v18.16b
+ ushr v9.2d, v0.2d, #2
+ ushr v16.2d, v7.2d, #2
+ ushr v18.2d, v2.2d, #2
+ eor v8.16b, v8.16b, v4.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ and v17.16b, v18.16b, v17.16b
+ eor v4.16b, v4.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v5.16b, v5.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #2
+ eor v1.16b, v1.16b, v8.16b
+ eor v0.16b, v0.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ eor v2.16b, v2.16b, v17.16b
+ ushr v8.2d, v4.2d, #4
+ ushr v9.2d, v6.2d, #4
+ ushr v16.2d, v1.2d, #4
+ ushr v17.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v5.16b
+ eor v9.16b, v9.16b, v3.16b
+ eor v16.16b, v16.16b, v7.16b
+ eor v17.16b, v17.16b, v2.16b
+ and v8.16b, v8.16b, v19.16b
+ and v9.16b, v9.16b, v19.16b
+ and v16.16b, v16.16b, v19.16b
+ and v17.16b, v17.16b, v19.16b
+ eor v5.16b, v5.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v7.16b, v7.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v2.16b, v2.16b, v17.16b
+ shl v17.2d, v17.2d, #4
+ eor v4.16b, v4.16b, v8.16b
+ eor v6.16b, v6.16b, v9.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v3.16b, v3.16b, v10.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v10.16b
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.section .rodata
+.type _bsaes_consts,%object
+.align 6
+_bsaes_consts:
+// InvShiftRows constants
+// Used in _bsaes_decrypt8, which assumes contiguity
+// .LM0ISR used with round 0 key
+// .LISR used with middle round keys
+// .LISRM0 used with final round key
+.LM0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+
+// ShiftRows constants
+// Used in _bsaes_encrypt8, which assumes contiguity
+// .LM0SR used with round 0 key
+// .LSR used with middle round keys
+// .LSRM0 used with final round key
+.LM0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+
+.LM0_bigendian:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LM0_littleendian:
+.quad 0x0105090d0004080c, 0x03070b0f02060a0e
+
+// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
+// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
+.LREVM0SR:
+.quad 0x090d01050c000408, 0x03070b0f060a0e02
+
+.align 6
+.size _bsaes_consts,.-_bsaes_consts
+
+.previous
+
+.type _bsaes_encrypt8,%function
+.align 4
+// On entry:
+// x9 -> key (previously expanded using _bsaes_key_convert)
+// x10 = number of rounds
+// v0-v7 input data
+// On exit:
+// x9-x11 corrupted
+// other general-purpose registers preserved
+// v0-v7 output data
+// v11-v15 preserved
+// other SIMD registers corrupted
+_bsaes_encrypt8:
+ ldr q8, [x9], #16
+ adrp x11, .LM0SR
+ add x11, x11, #:lo12:.LM0SR
+ ldr q9, [x11], #16
+_bsaes_encrypt8_alt:
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v8.16b
+ sub x10, x10, #1
+ eor v2.16b, v2.16b, v8.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v3.16b, v3.16b, v8.16b
+ eor v5.16b, v5.16b, v8.16b
+ tbl v0.16b, {v0.16b}, v9.16b
+ tbl v1.16b, {v1.16b}, v9.16b
+ tbl v2.16b, {v2.16b}, v9.16b
+ tbl v4.16b, {v4.16b}, v9.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v7.16b, v7.16b, v8.16b
+ tbl v3.16b, {v3.16b}, v9.16b
+ tbl v5.16b, {v5.16b}, v9.16b
+ tbl v6.16b, {v6.16b}, v9.16b
+ ushr v8.2d, v0.2d, #1
+ movi v10.16b, #0x55
+ tbl v7.16b, {v7.16b}, v9.16b
+ ushr v9.2d, v4.2d, #1
+ movi v16.16b, #0x33
+ ushr v17.2d, v2.2d, #1
+ eor v8.16b, v8.16b, v1.16b
+ movi v18.16b, #0x0f
+ ushr v19.2d, v6.2d, #1
+ eor v9.16b, v9.16b, v5.16b
+ eor v17.16b, v17.16b, v3.16b
+ and v8.16b, v8.16b, v10.16b
+ eor v19.16b, v19.16b, v7.16b
+ and v9.16b, v9.16b, v10.16b
+ and v17.16b, v17.16b, v10.16b
+ eor v1.16b, v1.16b, v8.16b
+ shl v8.2d, v8.2d, #1
+ and v10.16b, v19.16b, v10.16b
+ eor v5.16b, v5.16b, v9.16b
+ shl v9.2d, v9.2d, #1
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #1
+ eor v0.16b, v0.16b, v8.16b
+ shl v8.2d, v10.2d, #1
+ eor v7.16b, v7.16b, v10.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v2.16b, v17.16b
+ ushr v9.2d, v1.2d, #2
+ eor v6.16b, v6.16b, v8.16b
+ ushr v8.2d, v0.2d, #2
+ ushr v10.2d, v5.2d, #2
+ ushr v17.2d, v4.2d, #2
+ eor v9.16b, v9.16b, v3.16b
+ eor v8.16b, v8.16b, v2.16b
+ eor v10.16b, v10.16b, v7.16b
+ eor v17.16b, v17.16b, v6.16b
+ and v9.16b, v9.16b, v16.16b
+ and v8.16b, v8.16b, v16.16b
+ and v10.16b, v10.16b, v16.16b
+ and v16.16b, v17.16b, v16.16b
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v2.16b, v2.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v7.16b, v7.16b, v10.16b
+ shl v10.2d, v10.2d, #2
+ eor v6.16b, v6.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v1.16b, v1.16b, v9.16b
+ eor v0.16b, v0.16b, v8.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v3.2d, #4
+ ushr v9.2d, v2.2d, #4
+ ushr v10.2d, v1.2d, #4
+ ushr v16.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v7.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v10.16b, v10.16b, v5.16b
+ eor v16.16b, v16.16b, v4.16b
+ and v8.16b, v8.16b, v18.16b
+ and v9.16b, v9.16b, v18.16b
+ and v10.16b, v10.16b, v18.16b
+ and v16.16b, v16.16b, v18.16b
+ eor v7.16b, v7.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #4
+ eor v4.16b, v4.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v16.16b
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
+ ldp q8, q9, [x9], #32
+ eor v0.16b, v16.16b, v0.16b
+ ldr q10, [x9], #16
+ eor v1.16b, v17.16b, v1.16b
+ ldr q16, [x9], #16
+ eor v2.16b, v18.16b, v2.16b
+ eor v3.16b, v19.16b, v3.16b
+ eor v4.16b, v8.16b, v4.16b
+ eor v5.16b, v9.16b, v5.16b
+ eor v6.16b, v10.16b, v6.16b
+ eor v7.16b, v16.16b, v7.16b
+ tbl v0.16b, {v0.16b}, v28.16b
+ tbl v1.16b, {v1.16b}, v28.16b
+ tbl v2.16b, {v2.16b}, v28.16b
+ tbl v3.16b, {v3.16b}, v28.16b
+ tbl v4.16b, {v4.16b}, v28.16b
+ tbl v5.16b, {v5.16b}, v28.16b
+ tbl v6.16b, {v6.16b}, v28.16b
+ tbl v7.16b, {v7.16b}, v28.16b
+.Lenc_sbox:
+ eor v5.16b, v5.16b, v6.16b
+ eor v3.16b, v3.16b, v0.16b
+ subs x10, x10, #1
+ eor v2.16b, v2.16b, v1.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v8.16b, v3.16b, v7.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v5.16b
+ eor v8.16b, v8.16b, v4.16b
+ eor v3.16b, v6.16b, v3.16b
+ eor v4.16b, v4.16b, v5.16b
+ eor v6.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v7.16b
+ eor v1.16b, v8.16b, v1.16b
+ eor v8.16b, v7.16b, v4.16b
+ eor v9.16b, v3.16b, v0.16b
+ eor v10.16b, v7.16b, v6.16b
+ eor v16.16b, v5.16b, v3.16b
+ eor v17.16b, v6.16b, v2.16b
+ eor v18.16b, v5.16b, v1.16b
+ eor v19.16b, v2.16b, v4.16b
+ eor v20.16b, v1.16b, v0.16b
+ orr v21.16b, v8.16b, v9.16b
+ orr v22.16b, v10.16b, v16.16b
+ eor v23.16b, v8.16b, v17.16b
+ eor v24.16b, v9.16b, v18.16b
+ and v19.16b, v19.16b, v20.16b
+ orr v20.16b, v17.16b, v18.16b
+ and v8.16b, v8.16b, v9.16b
+ and v9.16b, v17.16b, v18.16b
+ and v17.16b, v23.16b, v24.16b
+ and v10.16b, v10.16b, v16.16b
+ eor v16.16b, v21.16b, v19.16b
+ eor v18.16b, v20.16b, v19.16b
+ and v19.16b, v2.16b, v1.16b
+ and v20.16b, v6.16b, v5.16b
+ eor v21.16b, v22.16b, v17.16b
+ eor v9.16b, v9.16b, v10.16b
+ eor v10.16b, v16.16b, v17.16b
+ eor v16.16b, v18.16b, v8.16b
+ and v17.16b, v4.16b, v0.16b
+ orr v18.16b, v7.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v8.16b, v9.16b, v8.16b
+ eor v9.16b, v10.16b, v19.16b
+ eor v10.16b, v3.16b, v0.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v17.16b, v5.16b, v1.16b
+ eor v19.16b, v21.16b, v20.16b
+ eor v20.16b, v8.16b, v18.16b
+ eor v8.16b, v8.16b, v18.16b
+ eor v18.16b, v7.16b, v4.16b
+ eor v21.16b, v9.16b, v16.16b
+ eor v22.16b, v6.16b, v2.16b
+ and v23.16b, v9.16b, v19.16b
+ eor v24.16b, v10.16b, v17.16b
+ eor v25.16b, v0.16b, v1.16b
+ eor v26.16b, v7.16b, v6.16b
+ eor v27.16b, v18.16b, v22.16b
+ eor v28.16b, v3.16b, v5.16b
+ eor v29.16b, v16.16b, v23.16b
+ eor v30.16b, v20.16b, v23.16b
+ eor v23.16b, v20.16b, v23.16b
+ eor v31.16b, v4.16b, v2.16b
+ bsl v29.16b, v19.16b, v20.16b
+ bsl v30.16b, v9.16b, v16.16b
+ bsl v8.16b, v29.16b, v23.16b
+ bsl v20.16b, v23.16b, v29.16b
+ eor v9.16b, v30.16b, v29.16b
+ and v5.16b, v5.16b, v30.16b
+ and v8.16b, v8.16b, v30.16b
+ and v1.16b, v1.16b, v29.16b
+ eor v16.16b, v19.16b, v20.16b
+ and v2.16b, v2.16b, v29.16b
+ eor v19.16b, v9.16b, v29.16b
+ and v17.16b, v17.16b, v9.16b
+ eor v8.16b, v8.16b, v21.16b
+ and v20.16b, v22.16b, v9.16b
+ eor v21.16b, v29.16b, v16.16b
+ eor v22.16b, v29.16b, v16.16b
+ and v23.16b, v25.16b, v16.16b
+ and v6.16b, v6.16b, v19.16b
+ eor v25.16b, v8.16b, v16.16b
+ eor v29.16b, v30.16b, v8.16b
+ and v4.16b, v21.16b, v4.16b
+ and v8.16b, v28.16b, v8.16b
+ and v0.16b, v22.16b, v0.16b
+ eor v21.16b, v23.16b, v1.16b
+ eor v22.16b, v9.16b, v25.16b
+ eor v9.16b, v9.16b, v25.16b
+ eor v23.16b, v25.16b, v16.16b
+ and v3.16b, v29.16b, v3.16b
+ and v24.16b, v24.16b, v25.16b
+ and v25.16b, v27.16b, v25.16b
+ and v10.16b, v22.16b, v10.16b
+ and v9.16b, v9.16b, v18.16b
+ eor v18.16b, v19.16b, v23.16b
+ and v19.16b, v26.16b, v23.16b
+ eor v3.16b, v5.16b, v3.16b
+ eor v17.16b, v17.16b, v24.16b
+ eor v10.16b, v24.16b, v10.16b
+ and v16.16b, v31.16b, v16.16b
+ eor v20.16b, v20.16b, v25.16b
+ eor v9.16b, v25.16b, v9.16b
+ eor v4.16b, v2.16b, v4.16b
+ and v7.16b, v18.16b, v7.16b
+ eor v18.16b, v19.16b, v6.16b
+ eor v5.16b, v8.16b, v5.16b
+ eor v0.16b, v1.16b, v0.16b
+ eor v1.16b, v21.16b, v10.16b
+ eor v8.16b, v3.16b, v17.16b
+ eor v2.16b, v16.16b, v2.16b
+ eor v3.16b, v6.16b, v7.16b
+ eor v6.16b, v18.16b, v9.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v10.16b, v5.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v9.16b, v2.16b, v9.16b
+ eor v3.16b, v3.16b, v20.16b
+ eor v7.16b, v6.16b, v1.16b
+ eor v5.16b, v8.16b, v4.16b
+ eor v6.16b, v10.16b, v1.16b
+ eor v2.16b, v4.16b, v0.16b
+ eor v4.16b, v3.16b, v10.16b
+ eor v9.16b, v9.16b, v7.16b
+ eor v3.16b, v0.16b, v5.16b
+ eor v0.16b, v1.16b, v4.16b
+ eor v1.16b, v4.16b, v8.16b
+ eor v4.16b, v9.16b, v5.16b
+ eor v6.16b, v6.16b, v3.16b
+ bcc .Lenc_done
+ ext v8.16b, v0.16b, v0.16b, #12
+ ext v9.16b, v4.16b, v4.16b, #12
+ ldr q28, [x11]
+ ext v10.16b, v6.16b, v6.16b, #12
+ ext v16.16b, v1.16b, v1.16b, #12
+ ext v17.16b, v3.16b, v3.16b, #12
+ ext v18.16b, v7.16b, v7.16b, #12
+ eor v0.16b, v0.16b, v8.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ ext v19.16b, v2.16b, v2.16b, #12
+ ext v20.16b, v5.16b, v5.16b, #12
+ eor v1.16b, v1.16b, v16.16b
+ eor v3.16b, v3.16b, v17.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v2.16b, v2.16b, v19.16b
+ eor v16.16b, v16.16b, v0.16b
+ eor v5.16b, v5.16b, v20.16b
+ eor v17.16b, v17.16b, v6.16b
+ eor v10.16b, v10.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v9.16b, v9.16b, v1.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v8.16b, v8.16b, v5.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v7.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v20.16b, v20.16b, v2.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v21.16b, v5.16b, v5.16b, #8
+ eor v17.16b, v17.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ eor v10.16b, v10.16b, v5.16b
+ ext v22.16b, v4.16b, v4.16b, #8
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v7.16b, v18.16b
+ eor v4.16b, v3.16b, v17.16b
+ eor v3.16b, v6.16b, v10.16b
+ eor v7.16b, v21.16b, v20.16b
+ eor v6.16b, v2.16b, v19.16b
+ eor v2.16b, v22.16b, v9.16b
+ bne .Lenc_loop
+ ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+ ushr v8.2d, v0.2d, #1
+ movi v9.16b, #0x55
+ ldr q10, [x9]
+ ushr v16.2d, v3.2d, #1
+ movi v17.16b, #0x33
+ ushr v18.2d, v4.2d, #1
+ movi v19.16b, #0x0f
+ eor v8.16b, v8.16b, v1.16b
+ ushr v20.2d, v2.2d, #1
+ eor v16.16b, v16.16b, v7.16b
+ eor v18.16b, v18.16b, v6.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v20.16b, v20.16b, v5.16b
+ and v16.16b, v16.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ shl v21.2d, v8.2d, #1
+ eor v1.16b, v1.16b, v8.16b
+ and v8.16b, v20.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ shl v9.2d, v16.2d, #1
+ eor v6.16b, v6.16b, v18.16b
+ shl v16.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v21.16b
+ shl v18.2d, v8.2d, #1
+ eor v5.16b, v5.16b, v8.16b
+ eor v3.16b, v3.16b, v9.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v1.2d, #2
+ eor v2.16b, v2.16b, v18.16b
+ ushr v9.2d, v0.2d, #2
+ ushr v16.2d, v7.2d, #2
+ ushr v18.2d, v3.2d, #2
+ eor v8.16b, v8.16b, v6.16b
+ eor v9.16b, v9.16b, v4.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v2.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ and v17.16b, v18.16b, v17.16b
+ eor v6.16b, v6.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v4.16b, v4.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v5.16b, v5.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v2.16b, v2.16b, v17.16b
+ shl v17.2d, v17.2d, #2
+ eor v1.16b, v1.16b, v8.16b
+ eor v0.16b, v0.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ eor v3.16b, v3.16b, v17.16b
+ ushr v8.2d, v6.2d, #4
+ ushr v9.2d, v4.2d, #4
+ ushr v16.2d, v1.2d, #4
+ ushr v17.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v5.16b
+ eor v9.16b, v9.16b, v2.16b
+ eor v16.16b, v16.16b, v7.16b
+ eor v17.16b, v17.16b, v3.16b
+ and v8.16b, v8.16b, v19.16b
+ and v9.16b, v9.16b, v19.16b
+ and v16.16b, v16.16b, v19.16b
+ and v17.16b, v17.16b, v19.16b
+ eor v5.16b, v5.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v2.16b, v2.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v7.16b, v7.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #4
+ eor v6.16b, v6.16b, v8.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v3.16b, v3.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v10.16b
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_key_convert,%function
+.align 4
+// On entry:
+// x9 -> input key (big-endian)
+// x10 = number of rounds
+// x17 -> output key (native endianness)
+// On exit:
+// x9, x10 corrupted
+// x11 -> .LM0_bigendian
+// x17 -> last quadword of output key
+// other general-purpose registers preserved
+// v2-v6 preserved
+// v7.16b[] = 0x63
+// v8-v14 preserved
+// v15 = last round key (converted to native endianness)
+// other SIMD registers corrupted
+_bsaes_key_convert:
+#ifdef __AARCH64EL__
+ adrp x11, .LM0_littleendian
+ add x11, x11, #:lo12:.LM0_littleendian
+#else
+ adrp x11, .LM0_bigendian
+ add x11, x11, #:lo12:.LM0_bigendian
+#endif
+ ldr q0, [x9], #16 // load round 0 key
+ ldr q1, [x11] // .LM0
+ ldr q15, [x9], #16 // load round 1 key
+
+ movi v7.16b, #0x63 // compose .L63
+ movi v16.16b, #0x01 // bit masks
+ movi v17.16b, #0x02
+ movi v18.16b, #0x04
+ movi v19.16b, #0x08
+ movi v20.16b, #0x10
+ movi v21.16b, #0x20
+ movi v22.16b, #0x40
+ movi v23.16b, #0x80
+
+#ifdef __AARCH64EL__
+ rev32 v0.16b, v0.16b
+#endif
+ sub x10, x10, #1
+ str q0, [x17], #16 // save round 0 key
+
+.align 4
+.Lkey_loop:
+ tbl v0.16b, {v15.16b}, v1.16b
+ ldr q15, [x9], #16 // load next round key
+
+ eor v0.16b, v0.16b, v7.16b
+ cmtst v24.16b, v0.16b, v16.16b
+ cmtst v25.16b, v0.16b, v17.16b
+ cmtst v26.16b, v0.16b, v18.16b
+ cmtst v27.16b, v0.16b, v19.16b
+ cmtst v28.16b, v0.16b, v20.16b
+ cmtst v29.16b, v0.16b, v21.16b
+ cmtst v30.16b, v0.16b, v22.16b
+ cmtst v31.16b, v0.16b, v23.16b
+ sub x10, x10, #1
+ st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key
+ st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64
+ cbnz x10, .Lkey_loop
+
+ // don't save last round key
+#ifdef __AARCH64EL__
+ rev32 v15.16b, v15.16b
+ adrp x11, .LM0_bigendian
+ add x11, x11, #:lo12:.LM0_bigendian
+#endif
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+
+.globl ossl_bsaes_cbc_encrypt
+.type ossl_bsaes_cbc_encrypt,%function
+.align 4
+// On entry:
+// x0 -> input ciphertext
+// x1 -> output plaintext
+// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
+// x3 -> key
+// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
+// w5 must be == 0
+// On exit:
+// Output plaintext filled in
+// Initialisation vector overwritten with last quadword of ciphertext
+// No output registers, usual AAPCS64 register preservation
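+//
+// For reference, the register interface above corresponds to a C-level
+// prototype along these lines (a sketch inferred from the comments above;
+// the authoritative declaration lives in OpenSSL's internal headers):
+//
+//     void ossl_bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+//                                 size_t length, const AES_KEY *key,
+//                                 unsigned char ivec[16], int enc);
+//
+// Consistent with the "w5 must be == 0" requirement, only the decrypt
+// direction is implemented here; inputs shorter than 128 bytes are simply
+// forwarded to AES_cbc_encrypt.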
+ossl_bsaes_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ cmp x2, #128
+ bhs .Lcbc_do_bsaes
+ b AES_cbc_encrypt
+.Lcbc_do_bsaes:
+
+ // it is up to the caller to make sure we are called with enc == 0
+
+ stp x29, x30, [sp, #-48]!
+ stp d8, d9, [sp, #16]
+ stp d10, d15, [sp, #32]
+ lsr x2, x2, #4 // len in 16 byte blocks
+
+ ldr w15, [x3, #240] // get # of rounds
+ mov x14, sp
+
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x3 // pass key
+ mov x10, x15 // pass # of rounds
+	mov	sp, x17			// switch sp to the key schedule area
+ bl _bsaes_key_convert
+ ldr q6, [sp]
+ str q15, [x17] // save last round key
+ eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
+ str q6, [sp]
+
+ ldr q15, [x4] // load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs x2, x2, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ ldr q0, [x0], #16 // load input
+ mov x9, sp // pass the key
+ ldr q1, [x0], #16
+ mov x10, x15
+ ldr q2, [x0], #16
+ ldr q3, [x0], #16
+ ldr q4, [x0], #16
+ ldr q5, [x0], #16
+ ldr q6, [x0], #16
+ ldr q7, [x0], #-7*16
+
+ bl _bsaes_decrypt8
+
+ ldr q16, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ eor v1.16b, v1.16b, v16.16b
+ str q0, [x1], #16 // write output
+ ldr q0, [x0], #16
+ str q1, [x1], #16
+ ldr q1, [x0], #16
+ eor v1.16b, v4.16b, v1.16b
+ ldr q4, [x0], #16
+ eor v2.16b, v2.16b, v4.16b
+ eor v0.16b, v6.16b, v0.16b
+ ldr q4, [x0], #16
+ str q0, [x1], #16
+ str q1, [x1], #16
+ eor v0.16b, v7.16b, v4.16b
+ ldr q1, [x0], #16
+ str q2, [x1], #16
+ ldr q2, [x0], #16
+ ldr q15, [x0], #16
+ str q0, [x1], #16
+ eor v0.16b, v5.16b, v2.16b
+ eor v1.16b, v3.16b, v1.16b
+ str q1, [x1], #16
+ str q0, [x1], #16
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds x2, x2, #8
+ beq .Lcbc_dec_done
+
+ ldr q0, [x0], #16 // load input
+ cmp x2, #2
+ blo .Lcbc_dec_one
+ ldr q1, [x0], #16
+ mov x9, sp // pass the key
+ mov x10, x15
+ beq .Lcbc_dec_two
+ ldr q2, [x0], #16
+ cmp x2, #4
+ blo .Lcbc_dec_three
+ ldr q3, [x0], #16
+ beq .Lcbc_dec_four
+ ldr q4, [x0], #16
+ cmp x2, #6
+ blo .Lcbc_dec_five
+ ldr q5, [x0], #16
+ beq .Lcbc_dec_six
+ ldr q6, [x0], #-6*16
+
+ bl _bsaes_decrypt8
+
+ ldr q5, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q8, [x0], #16
+ ldr q9, [x0], #16
+ ldr q10, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q0, [x0], #16
+ eor v1.16b, v1.16b, v5.16b
+ ldr q5, [x0], #16
+ eor v6.16b, v6.16b, v8.16b
+ ldr q15, [x0]
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ str q1, [x1], #16
+ eor v0.16b, v7.16b, v0.16b
+ str q6, [x1], #16
+ eor v1.16b, v3.16b, v5.16b
+ str q4, [x1], #16
+ str q2, [x1], #16
+ str q0, [x1], #16
+ str q1, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub x0, x0, #0x60
+ bl _bsaes_decrypt8
+ ldr q3, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q5, [x0], #16
+ ldr q8, [x0], #16
+ ldr q9, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q0, [x0], #16
+ eor v1.16b, v1.16b, v3.16b
+ ldr q15, [x0]
+ eor v3.16b, v6.16b, v5.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ str q1, [x1], #16
+ eor v0.16b, v7.16b, v0.16b
+ str q3, [x1], #16
+ str q4, [x1], #16
+ str q2, [x1], #16
+ str q0, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub x0, x0, #0x50
+ bl _bsaes_decrypt8
+ ldr q3, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q5, [x0], #16
+ ldr q7, [x0], #16
+ ldr q8, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q15, [x0]
+ eor v0.16b, v1.16b, v3.16b
+ eor v1.16b, v6.16b, v5.16b
+ eor v3.16b, v4.16b, v7.16b
+ str q0, [x1], #16
+ eor v0.16b, v2.16b, v8.16b
+ str q1, [x1], #16
+ str q3, [x1], #16
+ str q0, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub x0, x0, #0x40
+ bl _bsaes_decrypt8
+ ldr q2, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q3, [x0], #16
+ ldr q5, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q15, [x0]
+ eor v0.16b, v1.16b, v2.16b
+ eor v1.16b, v6.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ str q0, [x1], #16
+ str q1, [x1], #16
+ str q2, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub x0, x0, #0x30
+ bl _bsaes_decrypt8
+ ldr q2, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q3, [x0], #16
+ ldr q15, [x0]
+ str q0, [x1], #16 // write output
+ eor v0.16b, v1.16b, v2.16b
+ eor v1.16b, v6.16b, v3.16b
+ str q0, [x1], #16
+ str q1, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub x0, x0, #0x20
+ bl _bsaes_decrypt8
+ ldr q2, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q15, [x0]
+ str q0, [x1], #16 // write output
+ eor v0.16b, v1.16b, v2.16b
+ str q0, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub x0, x0, #0x10
+ stp x1, x4, [sp, #-32]!
+ str x14, [sp, #16]
+ mov v8.16b, v15.16b
+ mov v15.16b, v0.16b
+ mov x2, x3
+ bl AES_decrypt
+ ldr x14, [sp, #16]
+ ldp x1, x4, [sp], #32
+ ldr q0, [x1] // load result
+ eor v0.16b, v0.16b, v8.16b // ^= IV
+ str q0, [x1] // write output
+
+.align 4
+.Lcbc_dec_done:
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lcbc_dec_bzero: // wipe key schedule [if any]
+ stp q0, q1, [sp], #32
+ cmp sp, x14
+ bne .Lcbc_dec_bzero
+ str q15, [x4] // return IV
+ ldp d8, d9, [sp, #16]
+ ldp d10, d15, [sp, #32]
+ ldp x29, x30, [sp], #48
+ ret
+.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
+
+.globl ossl_bsaes_ctr32_encrypt_blocks
+.type ossl_bsaes_ctr32_encrypt_blocks,%function
+.align 4
+// On entry:
+// x0 -> input text (whole 16-byte blocks)
+// x1 -> output text (whole 16-byte blocks)
+// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
+// x3 -> key
+// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
+// On exit:
+// Output text filled in
+// No output registers, usual AAPCS64 register preservation
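+//
+// Sketch of the corresponding C-level signature, inferred from the register
+// comments above (parameter names are illustrative; the authoritative
+// declaration lives in OpenSSL's internal headers):
+//
+//     void ossl_bsaes_ctr32_encrypt_blocks(const unsigned char *in,
+//                                          unsigned char *out, size_t blocks,
+//                                          const AES_KEY *key,
+//                                          const unsigned char ivec[16]);
+//
+// Because only the low 32 bits of the counter increment (modulo 2^32, as
+// noted above), any carry into the upper 96 bits is the caller's concern.
+// Fewer than 8 blocks are processed with plain AES_encrypt (.Lctr_enc_short).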
+ossl_bsaes_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ cmp x2, #8 // use plain AES for
+ blo .Lctr_enc_short // small sizes
+
+ stp x29, x30, [sp, #-80]!
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+
+ ldr w15, [x3, #240] // get # of rounds
+ mov x14, sp
+
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x3 // pass key
+ mov x10, x15 // pass # of rounds
+	mov	sp, x17			// switch sp to the key schedule area
+ bl _bsaes_key_convert
+ eor v7.16b, v7.16b, v15.16b // fix up last round key
+ str q7, [x17] // save last round key
+
+ ldr q0, [x4] // load counter
+ add x13, x11, #.LREVM0SR-.LM0_bigendian
+ ldr q4, [sp] // load round0 key
+
+ movi v8.4s, #1 // compose 1<<96
+ movi v9.16b, #0
+ rev32 v15.16b, v0.16b
+ rev32 v0.16b, v0.16b
+ ext v11.16b, v9.16b, v8.16b, #4
+ rev32 v4.16b, v4.16b
+ add v12.4s, v11.4s, v11.4s // compose 2<<96
+ str q4, [sp] // save adjusted round0 key
+ add v13.4s, v11.4s, v12.4s // compose 3<<96
+ add v14.4s, v12.4s, v12.4s // compose 4<<96
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ // Intermix prologue from _bsaes_encrypt8 to use the opportunity
+ // to flip byte order in 32-bit counter
+
+ add v1.4s, v15.4s, v11.4s // +1
+ add x9, sp, #0x10 // pass next round key
+ add v2.4s, v15.4s, v12.4s // +2
+ ldr q9, [x13] // .LREVM0SR
+ ldr q8, [sp] // load round0 key
+ add v3.4s, v15.4s, v13.4s // +3
+ mov x10, x15 // pass rounds
+ sub x11, x13, #.LREVM0SR-.LSR // pass constants
+ add v6.4s, v2.4s, v14.4s
+ add v4.4s, v15.4s, v14.4s // +4
+ add v7.4s, v3.4s, v14.4s
+ add v15.4s, v4.4s, v14.4s // next counter
+ add v5.4s, v1.4s, v14.4s
+
+ bl _bsaes_encrypt8_alt
+
+ subs x2, x2, #8
+ blo .Lctr_enc_loop_done
+
+ ldr q16, [x0], #16
+ ldr q17, [x0], #16
+ eor v1.16b, v1.16b, v17.16b
+ ldr q17, [x0], #16
+ eor v0.16b, v0.16b, v16.16b
+ eor v4.16b, v4.16b, v17.16b
+ str q0, [x1], #16
+ ldr q16, [x0], #16
+ str q1, [x1], #16
+ mov v0.16b, v15.16b
+ str q4, [x1], #16
+ ldr q1, [x0], #16
+ eor v4.16b, v6.16b, v16.16b
+ eor v1.16b, v3.16b, v1.16b
+ ldr q3, [x0], #16
+ eor v3.16b, v7.16b, v3.16b
+ ldr q6, [x0], #16
+ eor v2.16b, v2.16b, v6.16b
+ ldr q6, [x0], #16
+ eor v5.16b, v5.16b, v6.16b
+ str q4, [x1], #16
+ str q1, [x1], #16
+ str q3, [x1], #16
+ str q2, [x1], #16
+ str q5, [x1], #16
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add x2, x2, #8
+ ldr q16, [x0], #16 // load input
+ eor v0.16b, v0.16b, v16.16b
+ str q0, [x1], #16 // write output
+ cmp x2, #2
+ blo .Lctr_enc_done
+ ldr q17, [x0], #16
+ eor v1.16b, v1.16b, v17.16b
+ str q1, [x1], #16
+ beq .Lctr_enc_done
+ ldr q18, [x0], #16
+ eor v4.16b, v4.16b, v18.16b
+ str q4, [x1], #16
+ cmp x2, #4
+ blo .Lctr_enc_done
+ ldr q19, [x0], #16
+ eor v6.16b, v6.16b, v19.16b
+ str q6, [x1], #16
+ beq .Lctr_enc_done
+ ldr q20, [x0], #16
+ eor v3.16b, v3.16b, v20.16b
+ str q3, [x1], #16
+ cmp x2, #6
+ blo .Lctr_enc_done
+ ldr q21, [x0], #16
+ eor v7.16b, v7.16b, v21.16b
+ str q7, [x1], #16
+ beq .Lctr_enc_done
+ ldr q22, [x0]
+ eor v2.16b, v2.16b, v22.16b
+ str q2, [x1], #16
+
+.Lctr_enc_done:
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lctr_enc_bzero: // wipe key schedule [if any]
+ stp q0, q1, [sp], #32
+ cmp sp, x14
+ bne .Lctr_enc_bzero
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp x29, x30, [sp], #80
+ ret
+
+.Lctr_enc_short:
+ stp x29, x30, [sp, #-96]!
+ stp x19, x20, [sp, #16]
+ stp x21, x22, [sp, #32]
+ str x23, [sp, #48]
+
+ mov x19, x0 // copy arguments
+ mov x20, x1
+ mov x21, x2
+ mov x22, x3
+ ldr w23, [x4, #12] // load counter .LSW
+ ldr q1, [x4] // load whole counter value
+#ifdef __AARCH64EL__
+ rev w23, w23
+#endif
+ str q1, [sp, #80] // copy counter value
+
+.Lctr_enc_short_loop:
+ add x0, sp, #80 // input counter value
+ add x1, sp, #64 // output on the stack
+ mov x2, x22 // key
+
+ bl AES_encrypt
+
+ ldr q0, [x19], #16 // load input
+ ldr q1, [sp, #64] // load encrypted counter
+ add x23, x23, #1
+#ifdef __AARCH64EL__
+ rev w0, w23
+ str w0, [sp, #80+12] // next counter value
+#else
+ str w23, [sp, #80+12] // next counter value
+#endif
+ eor v0.16b, v0.16b, v1.16b
+ str q0, [x20], #16 // store output
+ subs x21, x21, #1
+ bne .Lctr_enc_short_loop
+
+ movi v0.16b, #0
+ movi v1.16b, #0
+ stp q0, q1, [sp, #64]
+
+ ldr x23, [sp, #48]
+ ldp x21, x22, [sp, #32]
+ ldp x19, x20, [sp, #16]
+ ldp x29, x30, [sp], #96
+ ret
+.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
+
+.globl ossl_bsaes_xts_encrypt
+.type ossl_bsaes_xts_encrypt,%function
+.align 4
+// On entry:
+// x0 -> input plaintext
+// x1 -> output ciphertext
+// x2 = length of text in bytes (must be at least 16)
+// x3 -> key1 (used to encrypt the XORed plaintext blocks)
+// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
+// x5 -> 16-byte initial vector (typically, sector number)
+// On exit:
+// Output ciphertext filled in
+// No output registers, usual AAPCS64 register preservation
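+//
+// Sketch of the C-level signature implied by the register comments above
+// (a non-authoritative rendering; the real declaration is in OpenSSL's
+// internal headers):
+//
+//     void ossl_bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+//                                 size_t len, const AES_KEY *key1,
+//                                 const AES_KEY *key2, const unsigned char iv[16]);
+//
+// The initial tweak is produced by encrypting the 16-byte IV with key2 (the
+// AES_encrypt call under "generate initial tweak" below); lengths that are
+// not a multiple of 16 are finished with ciphertext stealing
+// (.Lxts_enc_steal).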
+ossl_bsaes_xts_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ // Stack layout:
+ // sp ->
+ // nrounds*128-96 bytes: key schedule
+ // x19 ->
+ // 16 bytes: frame record
+ // 4*16 bytes: tweak storage across _bsaes_encrypt8
+ // 6*8 bytes: storage for 5 callee-saved general-purpose registers
+ // 8*8 bytes: storage for 8 callee-saved SIMD registers
+ stp x29, x30, [sp, #-192]!
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ str x23, [sp, #112]
+ stp d8, d9, [sp, #128]
+ stp d10, d11, [sp, #144]
+ stp d12, d13, [sp, #160]
+ stp d14, d15, [sp, #176]
+
+ mov x19, sp
+ mov x20, x0
+ mov x21, x1
+ mov x22, x2
+ mov x23, x3
+
+ // generate initial tweak
+ sub sp, sp, #16
+ mov x0, x5 // iv[]
+ mov x1, sp
+ mov x2, x4 // key2
+ bl AES_encrypt
+ ldr q11, [sp], #16
+
+ ldr w1, [x23, #240] // get # of rounds
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x23 // pass key
+ mov x10, x1 // pass # of rounds
+ mov sp, x17
+ bl _bsaes_key_convert
+ eor v15.16b, v15.16b, v7.16b // fix up last round key
+ str q15, [x17] // save last round key
+
+ subs x22, x22, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ ldr q8, .Lxts_magic
+ mov x10, x1 // pass rounds
+ add x2, x19, #16
+ ldr q0, [x20], #16
+ sshr v1.2d, v11.2d, #63
+ mov x9, sp // pass key schedule
+ ldr q6, .Lxts_magic+16
+ add v2.2d, v11.2d, v11.2d
+ cmtst v3.2d, v11.2d, v6.2d
+ and v1.16b, v1.16b, v8.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ ldr q4, [x20], #16
+ eor v12.16b, v2.16b, v1.16b
+ eor v1.16b, v4.16b, v12.16b
+ eor v0.16b, v0.16b, v11.16b
+ cmtst v2.2d, v12.2d, v6.2d
+ add v4.2d, v12.2d, v12.2d
+ add x0, x19, #16
+ ext v3.16b, v3.16b, v3.16b, #8
+ and v2.16b, v2.16b, v8.16b
+ eor v13.16b, v4.16b, v3.16b
+ ldr q3, [x20], #16
+ ext v4.16b, v2.16b, v2.16b, #8
+ eor v2.16b, v3.16b, v13.16b
+ ldr q3, [x20], #16
+ add v5.2d, v13.2d, v13.2d
+ cmtst v7.2d, v13.2d, v6.2d
+ and v7.16b, v7.16b, v8.16b
+ ldr q9, [x20], #16
+ ext v7.16b, v7.16b, v7.16b, #8
+ ldr q10, [x20], #16
+ eor v14.16b, v5.16b, v4.16b
+ ldr q16, [x20], #16
+ add v4.2d, v14.2d, v14.2d
+ eor v3.16b, v3.16b, v14.16b
+ eor v15.16b, v4.16b, v7.16b
+ add v5.2d, v15.2d, v15.2d
+ ldr q7, [x20], #16
+ cmtst v4.2d, v14.2d, v6.2d
+ and v17.16b, v4.16b, v8.16b
+ cmtst v18.2d, v15.2d, v6.2d
+ eor v4.16b, v9.16b, v15.16b
+ ext v9.16b, v17.16b, v17.16b, #8
+ eor v9.16b, v5.16b, v9.16b
+ add v17.2d, v9.2d, v9.2d
+ and v18.16b, v18.16b, v8.16b
+ eor v5.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ ext v10.16b, v18.16b, v18.16b, #8
+ cmtst v9.2d, v9.2d, v6.2d
+ and v9.16b, v9.16b, v8.16b
+ eor v10.16b, v17.16b, v10.16b
+ cmtst v17.2d, v10.2d, v6.2d
+ eor v6.16b, v16.16b, v10.16b
+ str q10, [x2], #16
+ ext v9.16b, v9.16b, v9.16b, #8
+ add v10.2d, v10.2d, v10.2d
+ eor v9.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ eor v7.16b, v7.16b, v9.16b
+ add v9.2d, v9.2d, v9.2d
+ and v8.16b, v17.16b, v8.16b
+ ext v8.16b, v8.16b, v8.16b, #8
+ eor v8.16b, v9.16b, v8.16b
+ str q8, [x2] // next round tweak
+
+ bl _bsaes_encrypt8
+
+ ldr q8, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q9, [x0], #16
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ ldr q10, [x0], #16
+ eor v3.16b, v3.16b, v15.16b
+ subs x22, x22, #0x80
+ str q0, [x21], #16
+ ldr q11, [x0] // next round tweak
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v8.16b
+ eor v1.16b, v2.16b, v9.16b
+ str q4, [x21], #16
+ eor v2.16b, v5.16b, v10.16b
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q2, [x21], #16
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds x22, x22, #0x70
+ bmi .Lxts_enc_done
+
+ ldr q8, .Lxts_magic
+ sshr v1.2d, v11.2d, #63
+ add v2.2d, v11.2d, v11.2d
+ ldr q9, .Lxts_magic+16
+ subs x22, x22, #0x10
+ ldr q0, [x20], #16
+ and v1.16b, v1.16b, v8.16b
+ cmtst v3.2d, v11.2d, v9.2d
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ eor v12.16b, v2.16b, v1.16b
+ ext v1.16b, v3.16b, v3.16b, #8
+ add v2.2d, v12.2d, v12.2d
+ cmtst v3.2d, v12.2d, v9.2d
+ eor v13.16b, v2.16b, v1.16b
+ and v22.16b, v3.16b, v8.16b
+ bmi .Lxts_enc_1
+
+ ext v2.16b, v22.16b, v22.16b, #8
+ add v3.2d, v13.2d, v13.2d
+ ldr q1, [x20], #16
+ cmtst v4.2d, v13.2d, v9.2d
+ subs x22, x22, #0x10
+ eor v14.16b, v3.16b, v2.16b
+ and v23.16b, v4.16b, v8.16b
+ bmi .Lxts_enc_2
+
+ ext v3.16b, v23.16b, v23.16b, #8
+ add v4.2d, v14.2d, v14.2d
+ ldr q2, [x20], #16
+ cmtst v5.2d, v14.2d, v9.2d
+ eor v0.16b, v0.16b, v11.16b
+ subs x22, x22, #0x10
+ eor v15.16b, v4.16b, v3.16b
+ and v24.16b, v5.16b, v8.16b
+ bmi .Lxts_enc_3
+
+ ext v4.16b, v24.16b, v24.16b, #8
+ add v5.2d, v15.2d, v15.2d
+ ldr q3, [x20], #16
+ cmtst v6.2d, v15.2d, v9.2d
+ eor v1.16b, v1.16b, v12.16b
+ subs x22, x22, #0x10
+ eor v16.16b, v5.16b, v4.16b
+ and v25.16b, v6.16b, v8.16b
+ bmi .Lxts_enc_4
+
+ ext v5.16b, v25.16b, v25.16b, #8
+ add v6.2d, v16.2d, v16.2d
+ add x0, x19, #16
+ cmtst v7.2d, v16.2d, v9.2d
+ ldr q4, [x20], #16
+ eor v2.16b, v2.16b, v13.16b
+ str q16, [x0], #16
+ subs x22, x22, #0x10
+ eor v17.16b, v6.16b, v5.16b
+ and v26.16b, v7.16b, v8.16b
+ bmi .Lxts_enc_5
+
+ ext v7.16b, v26.16b, v26.16b, #8
+ add v18.2d, v17.2d, v17.2d
+ ldr q5, [x20], #16
+ eor v3.16b, v3.16b, v14.16b
+ str q17, [x0], #16
+ subs x22, x22, #0x10
+ eor v18.16b, v18.16b, v7.16b
+ bmi .Lxts_enc_6
+
+ ldr q6, [x20], #16
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ str q18, [x0] // next round tweak
+ mov x9, sp // pass key schedule
+ mov x10, x1
+ add x0, x19, #16
+ sub x22, x22, #0x10
+ eor v6.16b, v6.16b, v17.16b
+
+ bl _bsaes_encrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q17, [x0], #16
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ ldr q11, [x0] // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ eor v1.16b, v2.16b, v17.16b
+ str q4, [x21], #16
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_6:
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ ldr q11, [x0] // next round tweak
+ eor v3.16b, v3.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ str q4, [x21], #16
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q0, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_5:
+ eor v3.16b, v3.16b, v14.16b
+ eor v4.16b, v4.16b, v15.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q11, [x0] // next round tweak
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q4, [x21], #16
+ str q6, [x21], #16
+ str q3, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_4:
+ eor v2.16b, v2.16b, v13.16b
+ eor v3.16b, v3.16b, v14.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ mov v11.16b, v15.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q4, [x21], #16
+ str q6, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_3:
+ eor v1.16b, v1.16b, v12.16b
+ eor v2.16b, v2.16b, v13.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v13.16b
+ mov v11.16b, v14.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q4, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_2:
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov v11.16b, v13.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_1:
+ eor v0.16b, v0.16b, v11.16b
+ sub x0, sp, #16
+ sub x1, sp, #16
+ mov x2, x23
+ mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
+ mov v14.d[0], v12.d[1]
+ str q0, [sp, #-16]!
+
+ bl AES_encrypt
+
+ ldr q0, [sp], #16
+ trn1 v13.2d, v11.2d, v13.2d
+ trn1 v11.2d, v12.2d, v14.2d // next round tweak
+ eor v0.16b, v0.16b, v13.16b
+ str q0, [x21], #16
+
+.Lxts_enc_done:
+ adds x22, x22, #0x10
+ beq .Lxts_enc_ret
+
+ sub x6, x21, #0x10
+ // Penultimate plaintext block produces final ciphertext part-block
+ // plus remaining part of final plaintext block. Move ciphertext part
+ // to final position and reuse penultimate ciphertext block buffer to
+ // construct final plaintext block
+.Lxts_enc_steal:
+ ldrb w0, [x20], #1
+ ldrb w1, [x21, #-0x10]
+ strb w0, [x21, #-0x10]
+ strb w1, [x21], #1
+
+ subs x22, x22, #1
+ bhi .Lxts_enc_steal
+
+ // Finally encrypt the penultimate ciphertext block using the
+ // last tweak
+ ldr q0, [x6]
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [sp, #-16]!
+ mov x0, sp
+ mov x1, sp
+ mov x2, x23
+ mov x21, x6
+ mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
+
+ bl AES_encrypt
+
+ trn1 v11.2d, v11.2d, v13.2d
+ ldr q0, [sp], #16
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [x21]
+
+.Lxts_enc_ret:
+
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lxts_enc_bzero: // wipe key schedule
+ stp q0, q1, [sp], #32
+ cmp sp, x19
+ bne .Lxts_enc_bzero
+
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldr x23, [sp, #112]
+ ldp d8, d9, [sp, #128]
+ ldp d10, d11, [sp, #144]
+ ldp d12, d13, [sp, #160]
+ ldp d14, d15, [sp, #176]
+ ldp x29, x30, [sp], #192
+ ret
+.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
+
+// The assembler doesn't seem capable of de-duplicating these constants when
+// they are expressed using `ldr qd,=` syntax, so assign them a symbolic
+// address instead.
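+//
+// For reference: the first two quadwords {1, 0x87} are the per-lane carry
+// words for the XTS tweak update T' = T*x over GF(2^128) with reduction
+// polynomial x^128 + x^7 + x^2 + x + 1. The tweak-scheduling code doubles
+// each 64-bit lane with an `add .2d`, extracts each lane's carry-out with
+// `sshr #63`, swaps the lanes with `ext #8`, and XORs the result back in, so
+// the carry out of the low lane becomes bit 64 while the carry out of the
+// high lane folds 0x87 into the low byte. The repeated 0x4000000000000000
+// words are used with `cmtst` to test bit 62 of each lane, i.e. to
+// precompute the carries that the following doubling will produce. A
+// hypothetical scalar C sketch of one update, with the tweak held as two
+// little-endian 64-bit halves (lo, hi):
+//
+//     new_lo = (lo << 1) ^ ((hi >> 63) ? 0x87 : 0);
+//     new_hi = (hi << 1) | (lo >> 63);
+//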
+.align 5
+.Lxts_magic:
+.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000
+
+.globl ossl_bsaes_xts_decrypt
+.type ossl_bsaes_xts_decrypt,%function
+.align 4
+// On entry:
+// x0 -> input ciphertext
+// x1 -> output plaintext
+// x2 = length of text in bytes (must be at least 16)
+// x3 -> key1 (used to decrypt the XORed ciphertext blocks)
+// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
+// x5 -> 16-byte initial vector (typically, sector number)
+// On exit:
+// Output plaintext filled in
+// No output registers, usual AAPCS64 register preservation
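+//
+// C-level signature sketch matching the encrypt-side entry point above
+// (again inferred from the register comments, not copied from a header):
+//
+//     void ossl_bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+//                                 size_t len, const AES_KEY *key1,
+//                                 const AES_KEY *key2, const unsigned char iv[16]);
+//
+// Note that key2 is still used with AES_encrypt to derive the initial tweak;
+// only the data blocks themselves are decrypted with key1, and trailing
+// partial blocks are handled with ciphertext stealing (.Lxts_dec_steal).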
+ossl_bsaes_xts_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ // Stack layout:
+ // sp ->
+ // nrounds*128-96 bytes: key schedule
+ // x19 ->
+ // 16 bytes: frame record
+ // 4*16 bytes: tweak storage across _bsaes_decrypt8
+ // 6*8 bytes: storage for 5 callee-saved general-purpose registers
+ // 8*8 bytes: storage for 8 callee-saved SIMD registers
+ stp x29, x30, [sp, #-192]!
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ str x23, [sp, #112]
+ stp d8, d9, [sp, #128]
+ stp d10, d11, [sp, #144]
+ stp d12, d13, [sp, #160]
+ stp d14, d15, [sp, #176]
+
+ mov x19, sp
+ mov x20, x0
+ mov x21, x1
+ mov x22, x2
+ mov x23, x3
+
+ // generate initial tweak
+ sub sp, sp, #16
+ mov x0, x5 // iv[]
+ mov x1, sp
+ mov x2, x4 // key2
+ bl AES_encrypt
+ ldr q11, [sp], #16
+
+ ldr w1, [x23, #240] // get # of rounds
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x23 // pass key
+ mov x10, x1 // pass # of rounds
+ mov sp, x17
+ bl _bsaes_key_convert
+ ldr q6, [sp]
+ str q15, [x17] // save last round key
+ eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
+ str q6, [sp]
+
+ sub x30, x22, #0x10
+ tst x22, #0xf // if not multiple of 16
+ csel x22, x30, x22, ne // subtract another 16 bytes
+ subs x22, x22, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ ldr q8, .Lxts_magic
+ mov x10, x1 // pass rounds
+ add x2, x19, #16
+ ldr q0, [x20], #16
+ sshr v1.2d, v11.2d, #63
+ mov x9, sp // pass key schedule
+ ldr q6, .Lxts_magic+16
+ add v2.2d, v11.2d, v11.2d
+ cmtst v3.2d, v11.2d, v6.2d
+ and v1.16b, v1.16b, v8.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ ldr q4, [x20], #16
+ eor v12.16b, v2.16b, v1.16b
+ eor v1.16b, v4.16b, v12.16b
+ eor v0.16b, v0.16b, v11.16b
+ cmtst v2.2d, v12.2d, v6.2d
+ add v4.2d, v12.2d, v12.2d
+ add x0, x19, #16
+ ext v3.16b, v3.16b, v3.16b, #8
+ and v2.16b, v2.16b, v8.16b
+ eor v13.16b, v4.16b, v3.16b
+ ldr q3, [x20], #16
+ ext v4.16b, v2.16b, v2.16b, #8
+ eor v2.16b, v3.16b, v13.16b
+ ldr q3, [x20], #16
+ add v5.2d, v13.2d, v13.2d
+ cmtst v7.2d, v13.2d, v6.2d
+ and v7.16b, v7.16b, v8.16b
+ ldr q9, [x20], #16
+ ext v7.16b, v7.16b, v7.16b, #8
+ ldr q10, [x20], #16
+ eor v14.16b, v5.16b, v4.16b
+ ldr q16, [x20], #16
+ add v4.2d, v14.2d, v14.2d
+ eor v3.16b, v3.16b, v14.16b
+ eor v15.16b, v4.16b, v7.16b
+ add v5.2d, v15.2d, v15.2d
+ ldr q7, [x20], #16
+ cmtst v4.2d, v14.2d, v6.2d
+ and v17.16b, v4.16b, v8.16b
+ cmtst v18.2d, v15.2d, v6.2d
+ eor v4.16b, v9.16b, v15.16b
+ ext v9.16b, v17.16b, v17.16b, #8
+ eor v9.16b, v5.16b, v9.16b
+ add v17.2d, v9.2d, v9.2d
+ and v18.16b, v18.16b, v8.16b
+ eor v5.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ ext v10.16b, v18.16b, v18.16b, #8
+ cmtst v9.2d, v9.2d, v6.2d
+ and v9.16b, v9.16b, v8.16b
+ eor v10.16b, v17.16b, v10.16b
+ cmtst v17.2d, v10.2d, v6.2d
+ eor v6.16b, v16.16b, v10.16b
+ str q10, [x2], #16
+ ext v9.16b, v9.16b, v9.16b, #8
+ add v10.2d, v10.2d, v10.2d
+ eor v9.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ eor v7.16b, v7.16b, v9.16b
+ add v9.2d, v9.2d, v9.2d
+ and v8.16b, v17.16b, v8.16b
+ ext v8.16b, v8.16b, v8.16b, #8
+ eor v8.16b, v9.16b, v8.16b
+ str q8, [x2] // next round tweak
+
+ bl _bsaes_decrypt8
+
+ eor v6.16b, v6.16b, v13.16b
+ eor v0.16b, v0.16b, v11.16b
+ ldr q8, [x0], #16
+ eor v7.16b, v7.16b, v8.16b
+ str q0, [x21], #16
+ eor v0.16b, v1.16b, v12.16b
+ ldr q1, [x0], #16
+ eor v1.16b, v3.16b, v1.16b
+ subs x22, x22, #0x80
+ eor v2.16b, v2.16b, v15.16b
+ eor v3.16b, v4.16b, v14.16b
+ ldr q4, [x0], #16
+ str q0, [x21], #16
+ ldr q11, [x0] // next round tweak
+ eor v0.16b, v5.16b, v4.16b
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q2, [x21], #16
+ str q7, [x21], #16
+ str q1, [x21], #16
+ str q0, [x21], #16
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds x22, x22, #0x70
+ bmi .Lxts_dec_done
+
+ ldr q8, .Lxts_magic
+ sshr v1.2d, v11.2d, #63
+ add v2.2d, v11.2d, v11.2d
+ ldr q9, .Lxts_magic+16
+ subs x22, x22, #0x10
+ ldr q0, [x20], #16
+ and v1.16b, v1.16b, v8.16b
+ cmtst v3.2d, v11.2d, v9.2d
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ eor v12.16b, v2.16b, v1.16b
+ ext v1.16b, v3.16b, v3.16b, #8
+ add v2.2d, v12.2d, v12.2d
+ cmtst v3.2d, v12.2d, v9.2d
+ eor v13.16b, v2.16b, v1.16b
+ and v22.16b, v3.16b, v8.16b
+ bmi .Lxts_dec_1
+
+ ext v2.16b, v22.16b, v22.16b, #8
+ add v3.2d, v13.2d, v13.2d
+ ldr q1, [x20], #16
+ cmtst v4.2d, v13.2d, v9.2d
+ subs x22, x22, #0x10
+ eor v14.16b, v3.16b, v2.16b
+ and v23.16b, v4.16b, v8.16b
+ bmi .Lxts_dec_2
+
+ ext v3.16b, v23.16b, v23.16b, #8
+ add v4.2d, v14.2d, v14.2d
+ ldr q2, [x20], #16
+ cmtst v5.2d, v14.2d, v9.2d
+ eor v0.16b, v0.16b, v11.16b
+ subs x22, x22, #0x10
+ eor v15.16b, v4.16b, v3.16b
+ and v24.16b, v5.16b, v8.16b
+ bmi .Lxts_dec_3
+
+ ext v4.16b, v24.16b, v24.16b, #8
+ add v5.2d, v15.2d, v15.2d
+ ldr q3, [x20], #16
+ cmtst v6.2d, v15.2d, v9.2d
+ eor v1.16b, v1.16b, v12.16b
+ subs x22, x22, #0x10
+ eor v16.16b, v5.16b, v4.16b
+ and v25.16b, v6.16b, v8.16b
+ bmi .Lxts_dec_4
+
+ ext v5.16b, v25.16b, v25.16b, #8
+ add v6.2d, v16.2d, v16.2d
+ add x0, x19, #16
+ cmtst v7.2d, v16.2d, v9.2d
+ ldr q4, [x20], #16
+ eor v2.16b, v2.16b, v13.16b
+ str q16, [x0], #16
+ subs x22, x22, #0x10
+ eor v17.16b, v6.16b, v5.16b
+ and v26.16b, v7.16b, v8.16b
+ bmi .Lxts_dec_5
+
+ ext v7.16b, v26.16b, v26.16b, #8
+ add v18.2d, v17.2d, v17.2d
+ ldr q5, [x20], #16
+ eor v3.16b, v3.16b, v14.16b
+ str q17, [x0], #16
+ subs x22, x22, #0x10
+ eor v18.16b, v18.16b, v7.16b
+ bmi .Lxts_dec_6
+
+ ldr q6, [x20], #16
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ str q18, [x0] // next round tweak
+ mov x9, sp // pass key schedule
+ mov x10, x1
+ add x0, x19, #16
+ sub x22, x22, #0x10
+ eor v6.16b, v6.16b, v17.16b
+
+ bl _bsaes_decrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q17, [x0], #16
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ eor v2.16b, v2.16b, v15.16b
+ ldr q11, [x0] // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ eor v1.16b, v3.16b, v17.16b
+ str q6, [x21], #16
+ str q4, [x21], #16
+ str q2, [x21], #16
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_6:
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ ldr q11, [x0] // next round tweak
+ eor v2.16b, v2.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ str q6, [x21], #16
+ str q4, [x21], #16
+ str q2, [x21], #16
+ str q0, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_5:
+ eor v3.16b, v3.16b, v14.16b
+ eor v4.16b, v4.16b, v15.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q11, [x0] // next round tweak
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ eor v2.16b, v2.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q6, [x21], #16
+ str q4, [x21], #16
+ str q2, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_4:
+ eor v2.16b, v2.16b, v13.16b
+ eor v3.16b, v3.16b, v14.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ mov v11.16b, v15.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q6, [x21], #16
+ str q4, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_3:
+ eor v1.16b, v1.16b, v12.16b
+ eor v2.16b, v2.16b, v13.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ mov v11.16b, v14.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q6, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_2:
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov v11.16b, v13.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_1:
+ eor v0.16b, v0.16b, v11.16b
+ sub x0, sp, #16
+ sub x1, sp, #16
+ mov x2, x23
+ mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
+ mov v14.d[0], v12.d[1]
+ str q0, [sp, #-16]!
+
+ bl AES_decrypt
+
+ ldr q0, [sp], #16
+ trn1 v13.2d, v11.2d, v13.2d
+ trn1 v11.2d, v12.2d, v14.2d // next round tweak
+ eor v0.16b, v0.16b, v13.16b
+ str q0, [x21], #16
+
+.Lxts_dec_done:
+ adds x22, x22, #0x10
+ beq .Lxts_dec_ret
+
+ // calculate one round of extra tweak for the stolen ciphertext
+ ldr q8, .Lxts_magic
+ sshr v6.2d, v11.2d, #63
+ and v6.16b, v6.16b, v8.16b
+ add v12.2d, v11.2d, v11.2d
+ ext v6.16b, v6.16b, v6.16b, #8
+ eor v12.16b, v12.16b, v6.16b
+
+ // perform the final decryption with the last tweak value
+ ldr q0, [x20], #16
+ eor v0.16b, v0.16b, v12.16b
+ str q0, [sp, #-16]!
+ mov x0, sp
+ mov x1, sp
+ mov x2, x23
+ mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
+ mov v14.d[0], v12.d[1]
+
+ bl AES_decrypt
+
+ trn1 v12.2d, v12.2d, v14.2d
+ trn1 v11.2d, v11.2d, v13.2d
+ ldr q0, [sp], #16
+ eor v0.16b, v0.16b, v12.16b
+ str q0, [x21]
+
+ mov x6, x21
+ // Penultimate ciphertext block produces final plaintext part-block
+ // plus remaining part of final ciphertext block. Move plaintext part
+ // to final position and reuse penultimate plaintext block buffer to
+ // construct final ciphertext block
+.Lxts_dec_steal:
+ ldrb w1, [x21]
+ ldrb w0, [x20], #1
+ strb w1, [x21, #0x10]
+ strb w0, [x21], #1
+
+ subs x22, x22, #1
+ bhi .Lxts_dec_steal
+
+ // Finally decrypt the penultimate plaintext block using the
+ // penultimate tweak
+ ldr q0, [x6]
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [sp, #-16]!
+ mov x0, sp
+ mov x1, sp
+ mov x2, x23
+ mov x21, x6
+
+ bl AES_decrypt
+
+ trn1 v11.2d, v11.2d, v13.2d
+ ldr q0, [sp], #16
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [x21]
+
+.Lxts_dec_ret:
+
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lxts_dec_bzero: // wipe key schedule
+ stp q0, q1, [sp], #32
+ cmp sp, x19
+ bne .Lxts_dec_bzero
+
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldr x23, [sp, #112]
+ ldp d8, d9, [sp, #128]
+ ldp d10, d11, [sp, #144]
+ ldp d12, d13, [sp, #160]
+ ldp d14, d15, [sp, #176]
+ ldp x29, x30, [sp], #192
+ ret
+.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
diff --git a/sys/crypto/openssl/aarch64/chacha-armv8-sve.S b/sys/crypto/openssl/aarch64/chacha-armv8-sve.S
new file mode 100644
index 000000000000..e595adf377f9
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/chacha-armv8-sve.S
@@ -0,0 +1,3559 @@
+/* Do not modify. This file is auto-generated from chacha-armv8-sve.pl. */
+// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+//
+// ChaCha20 for ARMv8 via SVE
+//
+// $output is the last argument if it looks like a file (it has an extension)
+// $flavour is the first argument if it doesn't look like a file
+#include "arm_arch.h"
+
+.arch armv8-a
+
+
+.hidden OPENSSL_armcap_P
+
+.text
+
+.section .rodata
+.align 5
+.type _chacha_sve_consts,%object
+_chacha_sve_consts:
+.Lchacha20_consts:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lrot8:
+.word 0x02010003,0x04040404,0x02010003,0x04040404
+.size _chacha_sve_consts,.-_chacha_sve_consts
+
+.previous
+
+.globl ChaCha20_ctr32_sve
+.type ChaCha20_ctr32_sve,%function
+.align 5
+ChaCha20_ctr32_sve:
+ AARCH64_VALID_CALL_TARGET
+.inst 0x04a0e3e5 //cntw x5, ALL, MUL #1
+ cmp x2,x5,lsl #6
+ b.lt .Lreturn
+ mov x7,0
+ adrp x6,OPENSSL_armcap_P
+ ldr w6,[x6,#:lo12:OPENSSL_armcap_P]
+ tst w6,#ARMV8_SVE2
+ b.eq 1f
+ mov x7,1
+ b 2f
+1:
+ cmp x5,4
+ b.le .Lreturn
+ adrp x6,.Lrot8
+ add x6,x6,#:lo12:.Lrot8
+ ldp w9,w10,[x6]
+.inst 0x04aa4d3f //index z31.s,w9,w10
+2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ stp x16,x17,[sp,64]
+ stp x18,x19,[sp,80]
+ stp x20,x21,[sp,96]
+ stp x22,x23,[sp,112]
+ stp x24,x25,[sp,128]
+ stp x26,x27,[sp,144]
+ stp x28,x29,[sp,160]
+ str x30,[sp,176]
+
+ adrp x6,.Lchacha20_consts
+ add x6,x6,#:lo12:.Lchacha20_consts
+ ldp x23,x24,[x6]
+ ldp x25,x26,[x3]
+ ldp x27,x28,[x3, 16]
+ ldp x29,x30,[x4]
+.inst 0x2599e3e0 //ptrues p0.s,ALL
+#ifdef __AARCH64EB__
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x29,x29,#32
+ ror x30,x30,#32
+#endif
+ cbz x7, 1f
+.align 5
+100:
+ subs x7,x2,x5,lsl #6
+ b.lt 110f
+ mov x2,x7
+ b.eq 101f
+ cmp x2,64
+ b.lt 101f
+ mixin=1
+ lsr x8,x23,#32
+.inst 0x05a03ae0 //dup z0.s,w23
+.inst 0x05a03af9 //dup z25.s,w23
+.if mixin == 1
+ mov w7,w23
+.endif
+.inst 0x05a03904 //dup z4.s,w8
+.inst 0x05a0391a //dup z26.s,w8
+ lsr x10,x24,#32
+.inst 0x05a03b08 //dup z8.s,w24
+.inst 0x05a03b1b //dup z27.s,w24
+.if mixin == 1
+ mov w9,w24
+.endif
+.inst 0x05a0394c //dup z12.s,w10
+.inst 0x05a0395c //dup z28.s,w10
+ lsr x12,x25,#32
+.inst 0x05a03b21 //dup z1.s,w25
+.inst 0x05a03b3d //dup z29.s,w25
+.if mixin == 1
+ mov w11,w25
+.endif
+.inst 0x05a03985 //dup z5.s,w12
+.inst 0x05a0399e //dup z30.s,w12
+ lsr x14,x26,#32
+.inst 0x05a03b49 //dup z9.s,w26
+.inst 0x05a03b55 //dup z21.s,w26
+.if mixin == 1
+ mov w13,w26
+.endif
+.inst 0x05a039cd //dup z13.s,w14
+.inst 0x05a039d6 //dup z22.s,w14
+ lsr x16,x27,#32
+.inst 0x05a03b62 //dup z2.s,w27
+.inst 0x05a03b77 //dup z23.s,w27
+.if mixin == 1
+ mov w15,w27
+.endif
+.inst 0x05a03a06 //dup z6.s,w16
+.inst 0x05a03a18 //dup z24.s,w16
+ lsr x18,x28,#32
+.inst 0x05a03b8a //dup z10.s,w28
+.inst 0x05a03b91 //dup z17.s,w28
+.if mixin == 1
+ mov w17,w28
+.endif
+.inst 0x05a03a4e //dup z14.s,w18
+.inst 0x05a03a52 //dup z18.s,w18
+ lsr x22,x30,#32
+.inst 0x05a03bcb //dup z11.s,w30
+.inst 0x05a03bd4 //dup z20.s,w30
+.if mixin == 1
+ mov w21,w30
+.endif
+.inst 0x05a03acf //dup z15.s,w22
+.inst 0x05a03adf //dup z31.s,w22
+.if mixin == 1
+ add w20,w29,#1
+ mov w19,w29
+.inst 0x04a14690 //index z16.s,w20,1
+.inst 0x04a14683 //index z3.s,w20,1
+.else
+.inst 0x04a147b0 //index z16.s,w29,1
+.inst 0x04a147a3 //index z3.s,w29,1
+.endif
+ lsr x20,x29,#32
+.inst 0x05a03a87 //dup z7.s,w20
+.inst 0x05a03a93 //dup z19.s,w20
+ mov x6,#10
+10:
+.align 5
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04703403 //xar z3.s,z3.s,z0.s,16
+.if mixin == 1
+ ror w19,w19,16
+.endif
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04703487 //xar z7.s,z7.s,z4.s,16
+.if mixin == 1
+ ror w20,w20,16
+.endif
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x0470350b //xar z11.s,z11.s,z8.s,16
+.if mixin == 1
+ ror w21,w21,16
+.endif
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x0470358f //xar z15.s,z15.s,z12.s,16
+.if mixin == 1
+ ror w22,w22,16
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x046c3441 //xar z1.s,z1.s,z2.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x046c34c5 //xar z5.s,z5.s,z6.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04683403 //xar z3.s,z3.s,z0.s,24
+.if mixin == 1
+ ror w19,w19,24
+.endif
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04683487 //xar z7.s,z7.s,z4.s,24
+.if mixin == 1
+ ror w20,w20,24
+.endif
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x0468350b //xar z11.s,z11.s,z8.s,24
+.if mixin == 1
+ ror w21,w21,24
+.endif
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x0468358f //xar z15.s,z15.s,z12.s,24
+.if mixin == 1
+ ror w22,w22,24
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x04673441 //xar z1.s,z1.s,z2.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x04673549 //xar z9.s,z9.s,z10.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x046735cd //xar z13.s,z13.s,z14.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x0470340f //xar z15.s,z15.s,z0.s,16
+.if mixin == 1
+ ror w22,w22,16
+.endif
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04703483 //xar z3.s,z3.s,z4.s,16
+.if mixin == 1
+ ror w19,w19,16
+.endif
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04703507 //xar z7.s,z7.s,z8.s,16
+.if mixin == 1
+ ror w20,w20,16
+.endif
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x0470358b //xar z11.s,z11.s,z12.s,16
+.if mixin == 1
+ ror w21,w21,16
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x046c344d //xar z13.s,z13.s,z2.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x0468340f //xar z15.s,z15.s,z0.s,24
+.if mixin == 1
+ ror w22,w22,24
+.endif
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04683483 //xar z3.s,z3.s,z4.s,24
+.if mixin == 1
+ ror w19,w19,24
+.endif
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04683507 //xar z7.s,z7.s,z8.s,24
+.if mixin == 1
+ ror w20,w20,24
+.endif
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x0468358b //xar z11.s,z11.s,z12.s,24
+.if mixin == 1
+ ror w21,w21,24
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x04673545 //xar z5.s,z5.s,z10.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x0467344d //xar z13.s,z13.s,z2.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+ sub x6,x6,1
+ cbnz x6,10b
+.if mixin == 1
+ add w7,w7,w23
+.endif
+.inst 0x04b90000 //add z0.s,z0.s,z25.s
+.if mixin == 1
+ add x8,x8,x23,lsr #32
+.endif
+.inst 0x04ba0084 //add z4.s,z4.s,z26.s
+.if mixin == 1
+ add x7,x7,x8,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w9,w9,w24
+.endif
+.inst 0x04bb0108 //add z8.s,z8.s,z27.s
+.if mixin == 1
+ add x10,x10,x24,lsr #32
+.endif
+.inst 0x04bc018c //add z12.s,z12.s,z28.s
+.if mixin == 1
+ add x9,x9,x10,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x8,x10,[x1],#16
+.endif
+.if mixin == 1
+ add w11,w11,w25
+.endif
+.inst 0x04bd0021 //add z1.s,z1.s,z29.s
+.if mixin == 1
+ add x12,x12,x25,lsr #32
+.endif
+.inst 0x04be00a5 //add z5.s,z5.s,z30.s
+.if mixin == 1
+ add x11,x11,x12,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w13,w13,w26
+.endif
+.inst 0x04b50129 //add z9.s,z9.s,z21.s
+.if mixin == 1
+ add x14,x14,x26,lsr #32
+.endif
+.inst 0x04b601ad //add z13.s,z13.s,z22.s
+.if mixin == 1
+ add x13,x13,x14,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x12,x14,[x1],#16
+.endif
+.if mixin == 1
+ add w15,w15,w27
+.endif
+.inst 0x04b70042 //add z2.s,z2.s,z23.s
+.if mixin == 1
+ add x16,x16,x27,lsr #32
+.endif
+.inst 0x04b800c6 //add z6.s,z6.s,z24.s
+.if mixin == 1
+ add x15,x15,x16,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w17,w17,w28
+.endif
+.inst 0x04b1014a //add z10.s,z10.s,z17.s
+.if mixin == 1
+ add x18,x18,x28,lsr #32
+.endif
+.inst 0x04b201ce //add z14.s,z14.s,z18.s
+.if mixin == 1
+ add x17,x17,x18,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x16,x18,[x1],#16
+.endif
+.if mixin == 1
+ add w19,w19,w29
+.endif
+.inst 0x04b00063 //add z3.s,z3.s,z16.s
+.if mixin == 1
+ add x20,x20,x29,lsr #32
+.endif
+.inst 0x04b300e7 //add z7.s,z7.s,z19.s
+.if mixin == 1
+ add x19,x19,x20,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w21,w21,w30
+.endif
+.inst 0x04b4016b //add z11.s,z11.s,z20.s
+.if mixin == 1
+ add x22,x22,x30,lsr #32
+.endif
+.inst 0x04bf01ef //add z15.s,z15.s,z31.s
+.if mixin == 1
+ add x21,x21,x22,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x20,x22,[x1],#16
+.endif
+#ifdef __AARCH64EB__
+ rev x7,x7
+.inst 0x05a48000 //revb z0.s,p0/m,z0.s
+.inst 0x05a48084 //revb z4.s,p0/m,z4.s
+ rev x9,x9
+.inst 0x05a48108 //revb z8.s,p0/m,z8.s
+.inst 0x05a4818c //revb z12.s,p0/m,z12.s
+ rev x11,x11
+.inst 0x05a48021 //revb z1.s,p0/m,z1.s
+.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
+ rev x13,x13
+.inst 0x05a48129 //revb z9.s,p0/m,z9.s
+.inst 0x05a481ad //revb z13.s,p0/m,z13.s
+ rev x15,x15
+.inst 0x05a48042 //revb z2.s,p0/m,z2.s
+.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
+ rev x17,x17
+.inst 0x05a4814a //revb z10.s,p0/m,z10.s
+.inst 0x05a481ce //revb z14.s,p0/m,z14.s
+ rev x19,x19
+.inst 0x05a48063 //revb z3.s,p0/m,z3.s
+.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
+ rev x21,x21
+.inst 0x05a4816b //revb z11.s,p0/m,z11.s
+.inst 0x05a481ef //revb z15.s,p0/m,z15.s
+#endif
+.if mixin == 1
+ add x29,x29,#1
+.endif
+ cmp x5,4
+ b.ne 200f
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23021 //eor z1.d,z1.d,z18.d
+.inst 0x04b33042 //eor z2.d,z2.d,z19.d
+.inst 0x04b43063 //eor z3.d,z3.d,z20.d
+.inst 0x04b53084 //eor z4.d,z4.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
+.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13108 //eor z8.d,z8.d,z17.d
+.inst 0x04b23129 //eor z9.d,z9.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b4316b //eor z11.d,z11.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b5318c //eor z12.d,z12.d,z21.d
+.inst 0x04b631ad //eor z13.d,z13.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b731ce //eor z14.d,z14.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ b 210f
+200:
+.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
+.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
+.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
+.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
+
+.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
+.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
+.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
+.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
+.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
+.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
+
+.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
+.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
+.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
+.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
+.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
+
+.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
+.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
+.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
+.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
+
+.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
+.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
+
+.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
+.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
+.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23084 //eor z4.d,z4.d,z18.d
+.inst 0x04b33108 //eor z8.d,z8.d,z19.d
+.inst 0x04b4318c //eor z12.d,z12.d,z20.d
+.inst 0x04b53021 //eor z1.d,z1.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b73129 //eor z9.d,z9.d,z23.d
+.inst 0x04b831ad //eor z13.d,z13.d,z24.d
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13042 //eor z2.d,z2.d,z17.d
+.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b431ce //eor z14.d,z14.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b53063 //eor z3.d,z3.d,z21.d
+.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b7316b //eor z11.d,z11.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+210:
+.inst 0x04b0e3fd //incw x29, ALL, MUL #1
+ subs x2,x2,64
+ b.gt 100b
+ b 110f
+101:
+ mixin=0
+ lsr x8,x23,#32
+.inst 0x05a03ae0 //dup z0.s,w23
+.inst 0x05a03af9 //dup z25.s,w23
+.if mixin == 1
+ mov w7,w23
+.endif
+.inst 0x05a03904 //dup z4.s,w8
+.inst 0x05a0391a //dup z26.s,w8
+ lsr x10,x24,#32
+.inst 0x05a03b08 //dup z8.s,w24
+.inst 0x05a03b1b //dup z27.s,w24
+.if mixin == 1
+ mov w9,w24
+.endif
+.inst 0x05a0394c //dup z12.s,w10
+.inst 0x05a0395c //dup z28.s,w10
+ lsr x12,x25,#32
+.inst 0x05a03b21 //dup z1.s,w25
+.inst 0x05a03b3d //dup z29.s,w25
+.if mixin == 1
+ mov w11,w25
+.endif
+.inst 0x05a03985 //dup z5.s,w12
+.inst 0x05a0399e //dup z30.s,w12
+ lsr x14,x26,#32
+.inst 0x05a03b49 //dup z9.s,w26
+.inst 0x05a03b55 //dup z21.s,w26
+.if mixin == 1
+ mov w13,w26
+.endif
+.inst 0x05a039cd //dup z13.s,w14
+.inst 0x05a039d6 //dup z22.s,w14
+ lsr x16,x27,#32
+.inst 0x05a03b62 //dup z2.s,w27
+.inst 0x05a03b77 //dup z23.s,w27
+.if mixin == 1
+ mov w15,w27
+.endif
+.inst 0x05a03a06 //dup z6.s,w16
+.inst 0x05a03a18 //dup z24.s,w16
+ lsr x18,x28,#32
+.inst 0x05a03b8a //dup z10.s,w28
+.inst 0x05a03b91 //dup z17.s,w28
+.if mixin == 1
+ mov w17,w28
+.endif
+.inst 0x05a03a4e //dup z14.s,w18
+.inst 0x05a03a52 //dup z18.s,w18
+ lsr x22,x30,#32
+.inst 0x05a03bcb //dup z11.s,w30
+.inst 0x05a03bd4 //dup z20.s,w30
+.if mixin == 1
+ mov w21,w30
+.endif
+.inst 0x05a03acf //dup z15.s,w22
+.inst 0x05a03adf //dup z31.s,w22
+.if mixin == 1
+ add w20,w29,#1
+ mov w19,w29
+.inst 0x04a14690 //index z16.s,w20,1
+.inst 0x04a14683 //index z3.s,w20,1
+.else
+.inst 0x04a147b0 //index z16.s,w29,1
+.inst 0x04a147a3 //index z3.s,w29,1
+.endif
+ lsr x20,x29,#32
+.inst 0x05a03a87 //dup z7.s,w20
+.inst 0x05a03a93 //dup z19.s,w20
+ mov x6,#10
+10:
+.align 5
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04703403 //xar z3.s,z3.s,z0.s,16
+.if mixin == 1
+ ror w19,w19,16
+.endif
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04703487 //xar z7.s,z7.s,z4.s,16
+.if mixin == 1
+ ror w20,w20,16
+.endif
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x0470350b //xar z11.s,z11.s,z8.s,16
+.if mixin == 1
+ ror w21,w21,16
+.endif
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x0470358f //xar z15.s,z15.s,z12.s,16
+.if mixin == 1
+ ror w22,w22,16
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x046c3441 //xar z1.s,z1.s,z2.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x046c34c5 //xar z5.s,z5.s,z6.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04683403 //xar z3.s,z3.s,z0.s,24
+.if mixin == 1
+ ror w19,w19,24
+.endif
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04683487 //xar z7.s,z7.s,z4.s,24
+.if mixin == 1
+ ror w20,w20,24
+.endif
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x0468350b //xar z11.s,z11.s,z8.s,24
+.if mixin == 1
+ ror w21,w21,24
+.endif
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x0468358f //xar z15.s,z15.s,z12.s,24
+.if mixin == 1
+ ror w22,w22,24
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x04673441 //xar z1.s,z1.s,z2.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x04673549 //xar z9.s,z9.s,z10.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x046735cd //xar z13.s,z13.s,z14.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x0470340f //xar z15.s,z15.s,z0.s,16
+.if mixin == 1
+ ror w22,w22,16
+.endif
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04703483 //xar z3.s,z3.s,z4.s,16
+.if mixin == 1
+ ror w19,w19,16
+.endif
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04703507 //xar z7.s,z7.s,z8.s,16
+.if mixin == 1
+ ror w20,w20,16
+.endif
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x0470358b //xar z11.s,z11.s,z12.s,16
+.if mixin == 1
+ ror w21,w21,16
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x046c344d //xar z13.s,z13.s,z2.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x0468340f //xar z15.s,z15.s,z0.s,24
+.if mixin == 1
+ ror w22,w22,24
+.endif
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04683483 //xar z3.s,z3.s,z4.s,24
+.if mixin == 1
+ ror w19,w19,24
+.endif
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04683507 //xar z7.s,z7.s,z8.s,24
+.if mixin == 1
+ ror w20,w20,24
+.endif
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x0468358b //xar z11.s,z11.s,z12.s,24
+.if mixin == 1
+ ror w21,w21,24
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x04673545 //xar z5.s,z5.s,z10.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x0467344d //xar z13.s,z13.s,z2.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+ sub x6,x6,1
+ cbnz x6,10b
+.if mixin == 1
+ add w7,w7,w23
+.endif
+.inst 0x04b90000 //add z0.s,z0.s,z25.s
+.if mixin == 1
+ add x8,x8,x23,lsr #32
+.endif
+.inst 0x04ba0084 //add z4.s,z4.s,z26.s
+.if mixin == 1
+ add x7,x7,x8,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w9,w9,w24
+.endif
+.inst 0x04bb0108 //add z8.s,z8.s,z27.s
+.if mixin == 1
+ add x10,x10,x24,lsr #32
+.endif
+.inst 0x04bc018c //add z12.s,z12.s,z28.s
+.if mixin == 1
+ add x9,x9,x10,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x8,x10,[x1],#16
+.endif
+.if mixin == 1
+ add w11,w11,w25
+.endif
+.inst 0x04bd0021 //add z1.s,z1.s,z29.s
+.if mixin == 1
+ add x12,x12,x25,lsr #32
+.endif
+.inst 0x04be00a5 //add z5.s,z5.s,z30.s
+.if mixin == 1
+ add x11,x11,x12,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w13,w13,w26
+.endif
+.inst 0x04b50129 //add z9.s,z9.s,z21.s
+.if mixin == 1
+ add x14,x14,x26,lsr #32
+.endif
+.inst 0x04b601ad //add z13.s,z13.s,z22.s
+.if mixin == 1
+ add x13,x13,x14,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x12,x14,[x1],#16
+.endif
+.if mixin == 1
+ add w15,w15,w27
+.endif
+.inst 0x04b70042 //add z2.s,z2.s,z23.s
+.if mixin == 1
+ add x16,x16,x27,lsr #32
+.endif
+.inst 0x04b800c6 //add z6.s,z6.s,z24.s
+.if mixin == 1
+ add x15,x15,x16,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w17,w17,w28
+.endif
+.inst 0x04b1014a //add z10.s,z10.s,z17.s
+.if mixin == 1
+ add x18,x18,x28,lsr #32
+.endif
+.inst 0x04b201ce //add z14.s,z14.s,z18.s
+.if mixin == 1
+ add x17,x17,x18,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x16,x18,[x1],#16
+.endif
+.if mixin == 1
+ add w19,w19,w29
+.endif
+.inst 0x04b00063 //add z3.s,z3.s,z16.s
+.if mixin == 1
+ add x20,x20,x29,lsr #32
+.endif
+.inst 0x04b300e7 //add z7.s,z7.s,z19.s
+.if mixin == 1
+ add x19,x19,x20,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w21,w21,w30
+.endif
+.inst 0x04b4016b //add z11.s,z11.s,z20.s
+.if mixin == 1
+ add x22,x22,x30,lsr #32
+.endif
+.inst 0x04bf01ef //add z15.s,z15.s,z31.s
+.if mixin == 1
+ add x21,x21,x22,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x20,x22,[x1],#16
+.endif
+#ifdef __AARCH64EB__
+ rev x7,x7
+.inst 0x05a48000 //revb z0.s,p0/m,z0.s
+.inst 0x05a48084 //revb z4.s,p0/m,z4.s
+ rev x9,x9
+.inst 0x05a48108 //revb z8.s,p0/m,z8.s
+.inst 0x05a4818c //revb z12.s,p0/m,z12.s
+ rev x11,x11
+.inst 0x05a48021 //revb z1.s,p0/m,z1.s
+.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
+ rev x13,x13
+.inst 0x05a48129 //revb z9.s,p0/m,z9.s
+.inst 0x05a481ad //revb z13.s,p0/m,z13.s
+ rev x15,x15
+.inst 0x05a48042 //revb z2.s,p0/m,z2.s
+.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
+ rev x17,x17
+.inst 0x05a4814a //revb z10.s,p0/m,z10.s
+.inst 0x05a481ce //revb z14.s,p0/m,z14.s
+ rev x19,x19
+.inst 0x05a48063 //revb z3.s,p0/m,z3.s
+.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
+ rev x21,x21
+.inst 0x05a4816b //revb z11.s,p0/m,z11.s
+.inst 0x05a481ef //revb z15.s,p0/m,z15.s
+#endif
+.if mixin == 1
+ add x29,x29,#1
+.endif
+ cmp x5,4
+ b.ne 200f
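+// x5 is assumed here to hold the SVE vector length in 32-bit words: with 128-bit vectors (x5==4) the fall-through path moves the transposed blocks with NEON ld1/st1, while wider vectors branch to 200f and use predicated SVE ld1w/st1w instead.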
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23021 //eor z1.d,z1.d,z18.d
+.inst 0x04b33042 //eor z2.d,z2.d,z19.d
+.inst 0x04b43063 //eor z3.d,z3.d,z20.d
+.inst 0x04b53084 //eor z4.d,z4.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
+.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13108 //eor z8.d,z8.d,z17.d
+.inst 0x04b23129 //eor z9.d,z9.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b4316b //eor z11.d,z11.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b5318c //eor z12.d,z12.d,z21.d
+.inst 0x04b631ad //eor z13.d,z13.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b731ce //eor z14.d,z14.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ b 210f
+200:
+.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
+.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
+.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
+.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
+
+.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
+.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
+.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
+.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
+.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
+.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
+
+.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
+.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
+.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
+.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
+.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
+
+.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
+.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
+.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
+.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
+
+.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
+.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
+
+.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
+.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
+.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23084 //eor z4.d,z4.d,z18.d
+.inst 0x04b33108 //eor z8.d,z8.d,z19.d
+.inst 0x04b4318c //eor z12.d,z12.d,z20.d
+.inst 0x04b53021 //eor z1.d,z1.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b73129 //eor z9.d,z9.d,z23.d
+.inst 0x04b831ad //eor z13.d,z13.d,z24.d
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13042 //eor z2.d,z2.d,z17.d
+.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b431ce //eor z14.d,z14.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b53063 //eor z3.d,z3.d,z21.d
+.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b7316b //eor z11.d,z11.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+210:
+.inst 0x04b0e3fd //incw x29, ALL, MUL #1
+110:
+ b 2f
+1:
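+// Round-loop variant that avoids the SVE2 XAR instruction: each quarter-round rotation is built from eor followed by revh (rotate 16), tbl through the rotate-by-8 byte permutation expected to be preloaded in z31, or an lsl/lsr pair merged with orr (rotates 12 and 7).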
+.align 5
+100:
+ subs x7,x2,x5,lsl #6
+ b.lt 110f
+ mov x2,x7
+ b.eq 101f
+ cmp x2,64
+ b.lt 101f
+ mixin=1
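+// mixin==1: at least 64 bytes remain beyond what the vector lanes cover, so one extra block is processed in scalar registers (w7..w22); the ".if mixin == 1" blocks interleave that scalar work with the SVE instructions.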
+ lsr x8,x23,#32
+.inst 0x05a03ae0 //dup z0.s,w23
+.inst 0x05a03af9 //dup z25.s,w23
+.if mixin == 1
+ mov w7,w23
+.endif
+.inst 0x05a03904 //dup z4.s,w8
+.inst 0x05a0391a //dup z26.s,w8
+ lsr x10,x24,#32
+.inst 0x05a03b08 //dup z8.s,w24
+.inst 0x05a03b1b //dup z27.s,w24
+.if mixin == 1
+ mov w9,w24
+.endif
+.inst 0x05a0394c //dup z12.s,w10
+.inst 0x05a0395c //dup z28.s,w10
+ lsr x12,x25,#32
+.inst 0x05a03b21 //dup z1.s,w25
+.inst 0x05a03b3d //dup z29.s,w25
+.if mixin == 1
+ mov w11,w25
+.endif
+.inst 0x05a03985 //dup z5.s,w12
+.inst 0x05a0399e //dup z30.s,w12
+ lsr x14,x26,#32
+.inst 0x05a03b49 //dup z9.s,w26
+.inst 0x05a03b55 //dup z21.s,w26
+.if mixin == 1
+ mov w13,w26
+.endif
+.inst 0x05a039cd //dup z13.s,w14
+.inst 0x05a039d6 //dup z22.s,w14
+ lsr x16,x27,#32
+.inst 0x05a03b62 //dup z2.s,w27
+.inst 0x05a03b77 //dup z23.s,w27
+.if mixin == 1
+ mov w15,w27
+.endif
+.inst 0x05a03a06 //dup z6.s,w16
+.inst 0x05a03a18 //dup z24.s,w16
+ lsr x18,x28,#32
+.inst 0x05a03b8a //dup z10.s,w28
+.if mixin == 1
+ mov w17,w28
+.endif
+.inst 0x05a03a4e //dup z14.s,w18
+ lsr x22,x30,#32
+.inst 0x05a03bcb //dup z11.s,w30
+.if mixin == 1
+ mov w21,w30
+.endif
+.inst 0x05a03acf //dup z15.s,w22
+.if mixin == 1
+ add w20,w29,#1
+ mov w19,w29
+.inst 0x04a14690 //index z16.s,w20,1
+.inst 0x04a14683 //index z3.s,w20,1
+.else
+.inst 0x04a147b0 //index z16.s,w29,1
+.inst 0x04a147a3 //index z3.s,w29,1
+.endif
+ lsr x20,x29,#32
+.inst 0x05a03a87 //dup z7.s,w20
+ mov x6,#10
+10:
+.align 5
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.inst 0x04a03063 //eor z3.d,z3.d,z0.d
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04a8316b //eor z11.d,z11.d,z8.d
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x05a58063 //revh z3.s,p0/m,z3.s
+.if mixin == 1
+ ror w19,w19,#16
+.endif
+.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
+.if mixin == 1
+ ror w20,w20,#16
+.endif
+.inst 0x05a5816b //revh z11.s,p0/m,z11.s
+.if mixin == 1
+ ror w21,w21,#16
+.endif
+.inst 0x05a581ef //revh z15.s,p0/m,z15.s
+.if mixin == 1
+ ror w22,w22,#16
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.inst 0x04a23021 //eor z1.d,z1.d,z2.d
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x046c9c31 //lsl z17.s,z1.s,12
+.inst 0x046c9cb2 //lsl z18.s,z5.s,12
+.inst 0x046c9d33 //lsl z19.s,z9.s,12
+.inst 0x046c9db4 //lsl z20.s,z13.s,12
+.inst 0x046c9421 //lsr z1.s,z1.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.inst 0x046c94a5 //lsr z5.s,z5.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.inst 0x046c9529 //lsr z9.s,z9.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.inst 0x046c95ad //lsr z13.s,z13.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.inst 0x04713021 //orr z1.d,z1.d,z17.d
+.inst 0x047230a5 //orr z5.d,z5.d,z18.d
+.inst 0x04733129 //orr z9.d,z9.d,z19.d
+.inst 0x047431ad //orr z13.d,z13.d,z20.d
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.inst 0x04a03063 //eor z3.d,z3.d,z0.d
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04a8316b //eor z11.d,z11.d,z8.d
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
+.if mixin == 1
+ ror w19,w19,#24
+.endif
+.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
+.if mixin == 1
+ ror w20,w20,#24
+.endif
+.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
+.if mixin == 1
+ ror w21,w21,#24
+.endif
+.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
+.if mixin == 1
+ ror w22,w22,#24
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.inst 0x04a23021 //eor z1.d,z1.d,z2.d
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x04679c31 //lsl z17.s,z1.s,7
+.inst 0x04679cb2 //lsl z18.s,z5.s,7
+.inst 0x04679d33 //lsl z19.s,z9.s,7
+.inst 0x04679db4 //lsl z20.s,z13.s,7
+.inst 0x04679421 //lsr z1.s,z1.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+.inst 0x046794a5 //lsr z5.s,z5.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.inst 0x04679529 //lsr z9.s,z9.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.inst 0x046795ad //lsr z13.s,z13.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.inst 0x04713021 //orr z1.d,z1.d,z17.d
+.inst 0x047230a5 //orr z5.d,z5.d,z18.d
+.inst 0x04733129 //orr z9.d,z9.d,z19.d
+.inst 0x047431ad //orr z13.d,z13.d,z20.d
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.inst 0x04a031ef //eor z15.d,z15.d,z0.d
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x04a43063 //eor z3.d,z3.d,z4.d
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04ac316b //eor z11.d,z11.d,z12.d
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x05a581ef //revh z15.s,p0/m,z15.s
+.if mixin == 1
+ ror w22,w22,#16
+.endif
+.inst 0x05a58063 //revh z3.s,p0/m,z3.s
+.if mixin == 1
+ ror w19,w19,#16
+.endif
+.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
+.if mixin == 1
+ ror w20,w20,#16
+.endif
+.inst 0x05a5816b //revh z11.s,p0/m,z11.s
+.if mixin == 1
+ ror w21,w21,#16
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x04a231ad //eor z13.d,z13.d,z2.d
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x04a63021 //eor z1.d,z1.d,z6.d
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x046c9cb1 //lsl z17.s,z5.s,12
+.inst 0x046c9d32 //lsl z18.s,z9.s,12
+.inst 0x046c9db3 //lsl z19.s,z13.s,12
+.inst 0x046c9c34 //lsl z20.s,z1.s,12
+.inst 0x046c94a5 //lsr z5.s,z5.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.inst 0x046c9529 //lsr z9.s,z9.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.inst 0x046c95ad //lsr z13.s,z13.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.inst 0x046c9421 //lsr z1.s,z1.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.inst 0x047130a5 //orr z5.d,z5.d,z17.d
+.inst 0x04723129 //orr z9.d,z9.d,z18.d
+.inst 0x047331ad //orr z13.d,z13.d,z19.d
+.inst 0x04743021 //orr z1.d,z1.d,z20.d
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.inst 0x04a031ef //eor z15.d,z15.d,z0.d
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x04a43063 //eor z3.d,z3.d,z4.d
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04ac316b //eor z11.d,z11.d,z12.d
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
+.if mixin == 1
+ ror w22,w22,#24
+.endif
+.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
+.if mixin == 1
+ ror w19,w19,#24
+.endif
+.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
+.if mixin == 1
+ ror w20,w20,#24
+.endif
+.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
+.if mixin == 1
+ ror w21,w21,#24
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x04a231ad //eor z13.d,z13.d,z2.d
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x04a63021 //eor z1.d,z1.d,z6.d
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x04679cb1 //lsl z17.s,z5.s,7
+.inst 0x04679d32 //lsl z18.s,z9.s,7
+.inst 0x04679db3 //lsl z19.s,z13.s,7
+.inst 0x04679c34 //lsl z20.s,z1.s,7
+.inst 0x046794a5 //lsr z5.s,z5.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.inst 0x04679529 //lsr z9.s,z9.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.inst 0x046795ad //lsr z13.s,z13.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.inst 0x04679421 //lsr z1.s,z1.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+.inst 0x047130a5 //orr z5.d,z5.d,z17.d
+.inst 0x04723129 //orr z9.d,z9.d,z18.d
+.inst 0x047331ad //orr z13.d,z13.d,z19.d
+.inst 0x04743021 //orr z1.d,z1.d,z20.d
+ sub x6,x6,1
+ cbnz x6,10b
+ lsr x6,x28,#32
+.inst 0x05a03b91 //dup z17.s,w28
+.inst 0x05a038d2 //dup z18.s,w6
+ lsr x6,x29,#32
+.inst 0x05a038d3 //dup z19.s,w6
+ lsr x6,x30,#32
+.if mixin == 1
+ add w7,w7,w23
+.endif
+.inst 0x04b90000 //add z0.s,z0.s,z25.s
+.if mixin == 1
+ add x8,x8,x23,lsr #32
+.endif
+.inst 0x04ba0084 //add z4.s,z4.s,z26.s
+.if mixin == 1
+ add x7,x7,x8,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w9,w9,w24
+.endif
+.inst 0x04bb0108 //add z8.s,z8.s,z27.s
+.if mixin == 1
+ add x10,x10,x24,lsr #32
+.endif
+.inst 0x04bc018c //add z12.s,z12.s,z28.s
+.if mixin == 1
+ add x9,x9,x10,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x8,x10,[x1],#16
+.endif
+.if mixin == 1
+ add w11,w11,w25
+.endif
+.inst 0x04bd0021 //add z1.s,z1.s,z29.s
+.if mixin == 1
+ add x12,x12,x25,lsr #32
+.endif
+.inst 0x04be00a5 //add z5.s,z5.s,z30.s
+.if mixin == 1
+ add x11,x11,x12,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w13,w13,w26
+.endif
+.inst 0x04b50129 //add z9.s,z9.s,z21.s
+.if mixin == 1
+ add x14,x14,x26,lsr #32
+.endif
+.inst 0x04b601ad //add z13.s,z13.s,z22.s
+.if mixin == 1
+ add x13,x13,x14,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x12,x14,[x1],#16
+.endif
+.if mixin == 1
+ add w15,w15,w27
+.endif
+.inst 0x04b70042 //add z2.s,z2.s,z23.s
+.if mixin == 1
+ add x16,x16,x27,lsr #32
+.endif
+.inst 0x04b800c6 //add z6.s,z6.s,z24.s
+.if mixin == 1
+ add x15,x15,x16,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w17,w17,w28
+.endif
+.inst 0x04b1014a //add z10.s,z10.s,z17.s
+.if mixin == 1
+ add x18,x18,x28,lsr #32
+.endif
+.inst 0x04b201ce //add z14.s,z14.s,z18.s
+.if mixin == 1
+ add x17,x17,x18,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x16,x18,[x1],#16
+.endif
+.inst 0x05a03bd4 //dup z20.s,w30
+.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE
+.if mixin == 1
+ add w19,w19,w29
+.endif
+.inst 0x04b00063 //add z3.s,z3.s,z16.s
+.if mixin == 1
+ add x20,x20,x29,lsr #32
+.endif
+.inst 0x04b300e7 //add z7.s,z7.s,z19.s
+.if mixin == 1
+ add x19,x19,x20,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w21,w21,w30
+.endif
+.inst 0x04b4016b //add z11.s,z11.s,z20.s
+.if mixin == 1
+ add x22,x22,x30,lsr #32
+.endif
+.inst 0x04b901ef //add z15.s,z15.s,z25.s
+.if mixin == 1
+ add x21,x21,x22,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x20,x22,[x1],#16
+.endif
+#ifdef __AARCH64EB__
+ rev x7,x7
+.inst 0x05a48000 //revb z0.s,p0/m,z0.s
+.inst 0x05a48084 //revb z4.s,p0/m,z4.s
+ rev x9,x9
+.inst 0x05a48108 //revb z8.s,p0/m,z8.s
+.inst 0x05a4818c //revb z12.s,p0/m,z12.s
+ rev x11,x11
+.inst 0x05a48021 //revb z1.s,p0/m,z1.s
+.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
+ rev x13,x13
+.inst 0x05a48129 //revb z9.s,p0/m,z9.s
+.inst 0x05a481ad //revb z13.s,p0/m,z13.s
+ rev x15,x15
+.inst 0x05a48042 //revb z2.s,p0/m,z2.s
+.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
+ rev x17,x17
+.inst 0x05a4814a //revb z10.s,p0/m,z10.s
+.inst 0x05a481ce //revb z14.s,p0/m,z14.s
+ rev x19,x19
+.inst 0x05a48063 //revb z3.s,p0/m,z3.s
+.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
+ rev x21,x21
+.inst 0x05a4816b //revb z11.s,p0/m,z11.s
+.inst 0x05a481ef //revb z15.s,p0/m,z15.s
+#endif
+.if mixin == 1
+ add x29,x29,#1
+.endif
+ cmp x5,4
+ b.ne 200f
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23021 //eor z1.d,z1.d,z18.d
+.inst 0x04b33042 //eor z2.d,z2.d,z19.d
+.inst 0x04b43063 //eor z3.d,z3.d,z20.d
+.inst 0x04b53084 //eor z4.d,z4.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
+.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13108 //eor z8.d,z8.d,z17.d
+.inst 0x04b23129 //eor z9.d,z9.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b4316b //eor z11.d,z11.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b5318c //eor z12.d,z12.d,z21.d
+.inst 0x04b631ad //eor z13.d,z13.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b731ce //eor z14.d,z14.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ b 210f
+200:
+.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
+.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
+.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
+.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
+
+.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
+.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
+.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
+.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
+.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
+.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
+
+.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
+.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
+.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
+.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
+.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
+
+.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
+.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
+.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
+.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
+
+.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
+.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
+
+.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
+.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
+.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23084 //eor z4.d,z4.d,z18.d
+.inst 0x04b33108 //eor z8.d,z8.d,z19.d
+.inst 0x04b4318c //eor z12.d,z12.d,z20.d
+.inst 0x04b53021 //eor z1.d,z1.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b73129 //eor z9.d,z9.d,z23.d
+.inst 0x04b831ad //eor z13.d,z13.d,z24.d
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13042 //eor z2.d,z2.d,z17.d
+.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b431ce //eor z14.d,z14.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b53063 //eor z3.d,z3.d,z21.d
+.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b7316b //eor z11.d,z11.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+210:
+.inst 0x04b0e3fd //incw x29, ALL, MUL #1
+ subs x2,x2,64
+ b.gt 100b
+ b 110f
+101:
+ mixin=0
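+// mixin==0: vector-only iteration for this non-XAR path; the scalar ".if" blocks drop out as above.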
+ lsr x8,x23,#32
+.inst 0x05a03ae0 //dup z0.s,w23
+.inst 0x05a03af9 //dup z25.s,w23
+.if mixin == 1
+ mov w7,w23
+.endif
+.inst 0x05a03904 //dup z4.s,w8
+.inst 0x05a0391a //dup z26.s,w8
+ lsr x10,x24,#32
+.inst 0x05a03b08 //dup z8.s,w24
+.inst 0x05a03b1b //dup z27.s,w24
+.if mixin == 1
+ mov w9,w24
+.endif
+.inst 0x05a0394c //dup z12.s,w10
+.inst 0x05a0395c //dup z28.s,w10
+ lsr x12,x25,#32
+.inst 0x05a03b21 //dup z1.s,w25
+.inst 0x05a03b3d //dup z29.s,w25
+.if mixin == 1
+ mov w11,w25
+.endif
+.inst 0x05a03985 //dup z5.s,w12
+.inst 0x05a0399e //dup z30.s,w12
+ lsr x14,x26,#32
+.inst 0x05a03b49 //dup z9.s,w26
+.inst 0x05a03b55 //dup z21.s,w26
+.if mixin == 1
+ mov w13,w26
+.endif
+.inst 0x05a039cd //dup z13.s,w14
+.inst 0x05a039d6 //dup z22.s,w14
+ lsr x16,x27,#32
+.inst 0x05a03b62 //dup z2.s,w27
+.inst 0x05a03b77 //dup z23.s,w27
+.if mixin == 1
+ mov w15,w27
+.endif
+.inst 0x05a03a06 //dup z6.s,w16
+.inst 0x05a03a18 //dup z24.s,w16
+ lsr x18,x28,#32
+.inst 0x05a03b8a //dup z10.s,w28
+.if mixin == 1
+ mov w17,w28
+.endif
+.inst 0x05a03a4e //dup z14.s,w18
+ lsr x22,x30,#32
+.inst 0x05a03bcb //dup z11.s,w30
+.if mixin == 1
+ mov w21,w30
+.endif
+.inst 0x05a03acf //dup z15.s,w22
+.if mixin == 1
+ add w20,w29,#1
+ mov w19,w29
+.inst 0x04a14690 //index z16.s,w20,1
+.inst 0x04a14683 //index z3.s,w20,1
+.else
+.inst 0x04a147b0 //index z16.s,w29,1
+.inst 0x04a147a3 //index z3.s,w29,1
+.endif
+ lsr x20,x29,#32
+.inst 0x05a03a87 //dup z7.s,w20
+ mov x6,#10
+10:
+.align 5
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.inst 0x04a03063 //eor z3.d,z3.d,z0.d
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04a8316b //eor z11.d,z11.d,z8.d
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x05a58063 //revh z3.s,p0/m,z3.s
+.if mixin == 1
+ ror w19,w19,#16
+.endif
+.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
+.if mixin == 1
+ ror w20,w20,#16
+.endif
+.inst 0x05a5816b //revh z11.s,p0/m,z11.s
+.if mixin == 1
+ ror w21,w21,#16
+.endif
+.inst 0x05a581ef //revh z15.s,p0/m,z15.s
+.if mixin == 1
+ ror w22,w22,#16
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.inst 0x04a23021 //eor z1.d,z1.d,z2.d
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x046c9c31 //lsl z17.s,z1.s,12
+.inst 0x046c9cb2 //lsl z18.s,z5.s,12
+.inst 0x046c9d33 //lsl z19.s,z9.s,12
+.inst 0x046c9db4 //lsl z20.s,z13.s,12
+.inst 0x046c9421 //lsr z1.s,z1.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.inst 0x046c94a5 //lsr z5.s,z5.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.inst 0x046c9529 //lsr z9.s,z9.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.inst 0x046c95ad //lsr z13.s,z13.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.inst 0x04713021 //orr z1.d,z1.d,z17.d
+.inst 0x047230a5 //orr z5.d,z5.d,z18.d
+.inst 0x04733129 //orr z9.d,z9.d,z19.d
+.inst 0x047431ad //orr z13.d,z13.d,z20.d
+.inst 0x04a10000 //add z0.s,z0.s,z1.s
+.if mixin == 1
+ add w7,w7,w11
+.endif
+.inst 0x04a50084 //add z4.s,z4.s,z5.s
+.if mixin == 1
+ add w8,w8,w12
+.endif
+.inst 0x04a90108 //add z8.s,z8.s,z9.s
+.if mixin == 1
+ add w9,w9,w13
+.endif
+.inst 0x04ad018c //add z12.s,z12.s,z13.s
+.if mixin == 1
+ add w10,w10,w14
+.endif
+.inst 0x04a03063 //eor z3.d,z3.d,z0.d
+.if mixin == 1
+ eor w19,w19,w7
+.endif
+.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
+.if mixin == 1
+ eor w20,w20,w8
+.endif
+.inst 0x04a8316b //eor z11.d,z11.d,z8.d
+.if mixin == 1
+ eor w21,w21,w9
+.endif
+.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
+.if mixin == 1
+ eor w22,w22,w10
+.endif
+.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
+.if mixin == 1
+ ror w19,w19,#24
+.endif
+.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
+.if mixin == 1
+ ror w20,w20,#24
+.endif
+.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
+.if mixin == 1
+ ror w21,w21,#24
+.endif
+.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
+.if mixin == 1
+ ror w22,w22,#24
+.endif
+.inst 0x04a30042 //add z2.s,z2.s,z3.s
+.if mixin == 1
+ add w15,w15,w19
+.endif
+.inst 0x04a700c6 //add z6.s,z6.s,z7.s
+.if mixin == 1
+ add w16,w16,w20
+.endif
+.inst 0x04ab014a //add z10.s,z10.s,z11.s
+.if mixin == 1
+ add w17,w17,w21
+.endif
+.inst 0x04af01ce //add z14.s,z14.s,z15.s
+.if mixin == 1
+ add w18,w18,w22
+.endif
+.inst 0x04a23021 //eor z1.d,z1.d,z2.d
+.if mixin == 1
+ eor w11,w11,w15
+.endif
+.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
+.if mixin == 1
+ eor w12,w12,w16
+.endif
+.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
+.if mixin == 1
+ eor w13,w13,w17
+.endif
+.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
+.if mixin == 1
+ eor w14,w14,w18
+.endif
+.inst 0x04679c31 //lsl z17.s,z1.s,7
+.inst 0x04679cb2 //lsl z18.s,z5.s,7
+.inst 0x04679d33 //lsl z19.s,z9.s,7
+.inst 0x04679db4 //lsl z20.s,z13.s,7
+.inst 0x04679421 //lsr z1.s,z1.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+.inst 0x046794a5 //lsr z5.s,z5.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.inst 0x04679529 //lsr z9.s,z9.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.inst 0x046795ad //lsr z13.s,z13.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.inst 0x04713021 //orr z1.d,z1.d,z17.d
+.inst 0x047230a5 //orr z5.d,z5.d,z18.d
+.inst 0x04733129 //orr z9.d,z9.d,z19.d
+.inst 0x047431ad //orr z13.d,z13.d,z20.d
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.inst 0x04a031ef //eor z15.d,z15.d,z0.d
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x04a43063 //eor z3.d,z3.d,z4.d
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04ac316b //eor z11.d,z11.d,z12.d
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x05a581ef //revh z15.s,p0/m,z15.s
+.if mixin == 1
+ ror w22,w22,#16
+.endif
+.inst 0x05a58063 //revh z3.s,p0/m,z3.s
+.if mixin == 1
+ ror w19,w19,#16
+.endif
+.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
+.if mixin == 1
+ ror w20,w20,#16
+.endif
+.inst 0x05a5816b //revh z11.s,p0/m,z11.s
+.if mixin == 1
+ ror w21,w21,#16
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x04a231ad //eor z13.d,z13.d,z2.d
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x04a63021 //eor z1.d,z1.d,z6.d
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x046c9cb1 //lsl z17.s,z5.s,12
+.inst 0x046c9d32 //lsl z18.s,z9.s,12
+.inst 0x046c9db3 //lsl z19.s,z13.s,12
+.inst 0x046c9c34 //lsl z20.s,z1.s,12
+.inst 0x046c94a5 //lsr z5.s,z5.s,20
+.if mixin == 1
+ ror w12,w12,20
+.endif
+.inst 0x046c9529 //lsr z9.s,z9.s,20
+.if mixin == 1
+ ror w13,w13,20
+.endif
+.inst 0x046c95ad //lsr z13.s,z13.s,20
+.if mixin == 1
+ ror w14,w14,20
+.endif
+.inst 0x046c9421 //lsr z1.s,z1.s,20
+.if mixin == 1
+ ror w11,w11,20
+.endif
+.inst 0x047130a5 //orr z5.d,z5.d,z17.d
+.inst 0x04723129 //orr z9.d,z9.d,z18.d
+.inst 0x047331ad //orr z13.d,z13.d,z19.d
+.inst 0x04743021 //orr z1.d,z1.d,z20.d
+.inst 0x04a50000 //add z0.s,z0.s,z5.s
+.if mixin == 1
+ add w7,w7,w12
+.endif
+.inst 0x04a90084 //add z4.s,z4.s,z9.s
+.if mixin == 1
+ add w8,w8,w13
+.endif
+.inst 0x04ad0108 //add z8.s,z8.s,z13.s
+.if mixin == 1
+ add w9,w9,w14
+.endif
+.inst 0x04a1018c //add z12.s,z12.s,z1.s
+.if mixin == 1
+ add w10,w10,w11
+.endif
+.inst 0x04a031ef //eor z15.d,z15.d,z0.d
+.if mixin == 1
+ eor w22,w22,w7
+.endif
+.inst 0x04a43063 //eor z3.d,z3.d,z4.d
+.if mixin == 1
+ eor w19,w19,w8
+.endif
+.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
+.if mixin == 1
+ eor w20,w20,w9
+.endif
+.inst 0x04ac316b //eor z11.d,z11.d,z12.d
+.if mixin == 1
+ eor w21,w21,w10
+.endif
+.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
+.if mixin == 1
+ ror w22,w22,#24
+.endif
+.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
+.if mixin == 1
+ ror w19,w19,#24
+.endif
+.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
+.if mixin == 1
+ ror w20,w20,#24
+.endif
+.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
+.if mixin == 1
+ ror w21,w21,#24
+.endif
+.inst 0x04af014a //add z10.s,z10.s,z15.s
+.if mixin == 1
+ add w17,w17,w22
+.endif
+.inst 0x04a301ce //add z14.s,z14.s,z3.s
+.if mixin == 1
+ add w18,w18,w19
+.endif
+.inst 0x04a70042 //add z2.s,z2.s,z7.s
+.if mixin == 1
+ add w15,w15,w20
+.endif
+.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
+.if mixin == 1
+ add w16,w16,w21
+.endif
+.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
+.if mixin == 1
+ eor w12,w12,w17
+.endif
+.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
+.if mixin == 1
+ eor w13,w13,w18
+.endif
+.inst 0x04a231ad //eor z13.d,z13.d,z2.d
+.if mixin == 1
+ eor w14,w14,w15
+.endif
+.inst 0x04a63021 //eor z1.d,z1.d,z6.d
+.if mixin == 1
+ eor w11,w11,w16
+.endif
+.inst 0x04679cb1 //lsl z17.s,z5.s,7
+.inst 0x04679d32 //lsl z18.s,z9.s,7
+.inst 0x04679db3 //lsl z19.s,z13.s,7
+.inst 0x04679c34 //lsl z20.s,z1.s,7
+.inst 0x046794a5 //lsr z5.s,z5.s,25
+.if mixin == 1
+ ror w12,w12,25
+.endif
+.inst 0x04679529 //lsr z9.s,z9.s,25
+.if mixin == 1
+ ror w13,w13,25
+.endif
+.inst 0x046795ad //lsr z13.s,z13.s,25
+.if mixin == 1
+ ror w14,w14,25
+.endif
+.inst 0x04679421 //lsr z1.s,z1.s,25
+.if mixin == 1
+ ror w11,w11,25
+.endif
+.inst 0x047130a5 //orr z5.d,z5.d,z17.d
+.inst 0x04723129 //orr z9.d,z9.d,z18.d
+.inst 0x047331ad //orr z13.d,z13.d,z19.d
+.inst 0x04743021 //orr z1.d,z1.d,z20.d
+ sub x6,x6,1
+ cbnz x6,10b
+ lsr x6,x28,#32
+.inst 0x05a03b91 //dup z17.s,w28
+.inst 0x05a038d2 //dup z18.s,w6
+ lsr x6,x29,#32
+.inst 0x05a038d3 //dup z19.s,w6
+ lsr x6,x30,#32
+.if mixin == 1
+ add w7,w7,w23
+.endif
+.inst 0x04b90000 //add z0.s,z0.s,z25.s
+.if mixin == 1
+ add x8,x8,x23,lsr #32
+.endif
+.inst 0x04ba0084 //add z4.s,z4.s,z26.s
+.if mixin == 1
+ add x7,x7,x8,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w9,w9,w24
+.endif
+.inst 0x04bb0108 //add z8.s,z8.s,z27.s
+.if mixin == 1
+ add x10,x10,x24,lsr #32
+.endif
+.inst 0x04bc018c //add z12.s,z12.s,z28.s
+.if mixin == 1
+ add x9,x9,x10,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x8,x10,[x1],#16
+.endif
+.if mixin == 1
+ add w11,w11,w25
+.endif
+.inst 0x04bd0021 //add z1.s,z1.s,z29.s
+.if mixin == 1
+ add x12,x12,x25,lsr #32
+.endif
+.inst 0x04be00a5 //add z5.s,z5.s,z30.s
+.if mixin == 1
+ add x11,x11,x12,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w13,w13,w26
+.endif
+.inst 0x04b50129 //add z9.s,z9.s,z21.s
+.if mixin == 1
+ add x14,x14,x26,lsr #32
+.endif
+.inst 0x04b601ad //add z13.s,z13.s,z22.s
+.if mixin == 1
+ add x13,x13,x14,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x12,x14,[x1],#16
+.endif
+.if mixin == 1
+ add w15,w15,w27
+.endif
+.inst 0x04b70042 //add z2.s,z2.s,z23.s
+.if mixin == 1
+ add x16,x16,x27,lsr #32
+.endif
+.inst 0x04b800c6 //add z6.s,z6.s,z24.s
+.if mixin == 1
+ add x15,x15,x16,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w17,w17,w28
+.endif
+.inst 0x04b1014a //add z10.s,z10.s,z17.s
+.if mixin == 1
+ add x18,x18,x28,lsr #32
+.endif
+.inst 0x04b201ce //add z14.s,z14.s,z18.s
+.if mixin == 1
+ add x17,x17,x18,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x16,x18,[x1],#16
+.endif
+.inst 0x05a03bd4 //dup z20.s,w30
+.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE
+.if mixin == 1
+ add w19,w19,w29
+.endif
+.inst 0x04b00063 //add z3.s,z3.s,z16.s
+.if mixin == 1
+ add x20,x20,x29,lsr #32
+.endif
+.inst 0x04b300e7 //add z7.s,z7.s,z19.s
+.if mixin == 1
+ add x19,x19,x20,lsl #32 // pack
+.endif
+.if mixin == 1
+ add w21,w21,w30
+.endif
+.inst 0x04b4016b //add z11.s,z11.s,z20.s
+.if mixin == 1
+ add x22,x22,x30,lsr #32
+.endif
+.inst 0x04b901ef //add z15.s,z15.s,z25.s
+.if mixin == 1
+ add x21,x21,x22,lsl #32 // pack
+.endif
+.if mixin == 1
+ ldp x20,x22,[x1],#16
+.endif
+#ifdef __AARCH64EB__
+ rev x7,x7
+.inst 0x05a48000 //revb z0.s,p0/m,z0.s
+.inst 0x05a48084 //revb z4.s,p0/m,z4.s
+ rev x9,x9
+.inst 0x05a48108 //revb z8.s,p0/m,z8.s
+.inst 0x05a4818c //revb z12.s,p0/m,z12.s
+ rev x11,x11
+.inst 0x05a48021 //revb z1.s,p0/m,z1.s
+.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
+ rev x13,x13
+.inst 0x05a48129 //revb z9.s,p0/m,z9.s
+.inst 0x05a481ad //revb z13.s,p0/m,z13.s
+ rev x15,x15
+.inst 0x05a48042 //revb z2.s,p0/m,z2.s
+.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
+ rev x17,x17
+.inst 0x05a4814a //revb z10.s,p0/m,z10.s
+.inst 0x05a481ce //revb z14.s,p0/m,z14.s
+ rev x19,x19
+.inst 0x05a48063 //revb z3.s,p0/m,z3.s
+.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
+ rev x21,x21
+.inst 0x05a4816b //revb z11.s,p0/m,z11.s
+.inst 0x05a481ef //revb z15.s,p0/m,z15.s
+#endif
+.if mixin == 1
+ add x29,x29,#1
+.endif
+ cmp x5,4
+ b.ne 200f
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23021 //eor z1.d,z1.d,z18.d
+.inst 0x04b33042 //eor z2.d,z2.d,z19.d
+.inst 0x04b43063 //eor z3.d,z3.d,z20.d
+.inst 0x04b53084 //eor z4.d,z4.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
+.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
+ ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
+ ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13108 //eor z8.d,z8.d,z17.d
+.inst 0x04b23129 //eor z9.d,z9.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b4316b //eor z11.d,z11.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b5318c //eor z12.d,z12.d,z21.d
+.inst 0x04b631ad //eor z13.d,z13.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b731ce //eor z14.d,z14.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ b 210f
+200:
+.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
+.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
+.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
+.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
+
+.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
+.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
+.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
+.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
+.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
+.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
+
+.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
+.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
+.if mixin == 1
+ eor x7,x7,x8
+.endif
+.if mixin == 1
+ eor x9,x9,x10
+.endif
+.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
+.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
+.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
+.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
+
+.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
+.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
+.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
+.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
+
+.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
+.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
+
+.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
+.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
+.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x11,x11,x12
+.endif
+.if mixin == 1
+ eor x13,x13,x14
+.endif
+.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
+.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
+.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
+.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
+
+.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
+.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
+.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
+.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
+
+.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
+.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
+.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
+.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
+
+.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
+.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
+.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
+.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
+.if mixin == 1
+ eor x15,x15,x16
+.endif
+.if mixin == 1
+ eor x17,x17,x18
+.endif
+.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
+.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
+.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
+.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
+
+.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
+.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
+.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
+.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
+
+.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
+.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
+.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
+.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
+
+.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
+.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
+.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
+.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
+.if mixin == 1
+ eor x19,x19,x20
+.endif
+.if mixin == 1
+ eor x21,x21,x22
+.endif
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.inst 0x04b13000 //eor z0.d,z0.d,z17.d
+.inst 0x04b23084 //eor z4.d,z4.d,z18.d
+.inst 0x04b33108 //eor z8.d,z8.d,z19.d
+.inst 0x04b4318c //eor z12.d,z12.d,z20.d
+.inst 0x04b53021 //eor z1.d,z1.d,z21.d
+.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
+.inst 0x04b73129 //eor z9.d,z9.d,z23.d
+.inst 0x04b831ad //eor z13.d,z13.d,z24.d
+.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
+.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
+.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
+.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
+.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
+.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
+.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
+.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
+.inst 0x04215101 //addvl x1,x1,8
+.if mixin == 1
+ stp x7,x9,[x0],#16
+.endif
+.inst 0x04b13042 //eor z2.d,z2.d,z17.d
+.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
+.if mixin == 1
+ stp x11,x13,[x0],#16
+.endif
+.inst 0x04b3314a //eor z10.d,z10.d,z19.d
+.inst 0x04b431ce //eor z14.d,z14.d,z20.d
+.if mixin == 1
+ stp x15,x17,[x0],#16
+.endif
+.inst 0x04b53063 //eor z3.d,z3.d,z21.d
+.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
+.if mixin == 1
+ stp x19,x21,[x0],#16
+.endif
+.inst 0x04b7316b //eor z11.d,z11.d,z23.d
+.inst 0x04b831ef //eor z15.d,z15.d,z24.d
+.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
+.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
+.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
+.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
+.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
+.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
+.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
+.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
+.inst 0x04205100 //addvl x0,x0,8
+210:
+.inst 0x04b0e3fd //incw x29, ALL, MUL #1
+110:
+2:
+ str w29,[x4]
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp x16,x17,[sp,64]
+ ldp x18,x19,[sp,80]
+ ldp x20,x21,[sp,96]
+ ldp x22,x23,[sp,112]
+ ldp x24,x25,[sp,128]
+ ldp x26,x27,[sp,144]
+ ldp x28,x29,[sp,160]
+ ldr x30,[sp,176]
+ ldp d8,d9,[sp],192
+ AARCH64_VALIDATE_LINK_REGISTER
+.Lreturn:
+ ret
+.size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
diff --git a/sys/crypto/openssl/aarch64/chacha-armv8.S b/sys/crypto/openssl/aarch64/chacha-armv8.S
index 4f9d6bd372f7..ee32415ad4c3 100644
--- a/sys/crypto/openssl/aarch64/chacha-armv8.S
+++ b/sys/crypto/openssl/aarch64/chacha-armv8.S
@@ -3,9 +3,11 @@
#ifndef __KERNEL__
.hidden OPENSSL_armcap_P
+
+
#endif
-.text
+.section .rodata
.align 5
.Lsigma:
@@ -17,18 +19,19 @@
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align 2
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
+.text
+
+.globl ChaCha20_ctr32_dflt
+.type ChaCha20_ctr32_dflt,%function
.align 5
-ChaCha20_ctr32:
+ChaCha20_ctr32_dflt:
AARCH64_SIGN_LINK_REGISTER
- cbz x2,.Labort
cmp x2,#192
b.lo .Lshort
-
#ifndef __KERNEL__
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+.Lcheck_neon:
tst w17,#ARMV7_NEON
b.ne .LChaCha20_neon
#endif
@@ -37,7 +40,8 @@ ChaCha20_ctr32:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
- adr x5,.Lsigma
+ adrp x5,.Lsigma
+ add x5,x5,#:lo12:.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
@@ -304,6 +308,41 @@ ChaCha20_ctr32:
ldp x29,x30,[sp],#96
AARCH64_VALIDATE_LINK_REGISTER
ret
+.size ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+ AARCH64_SIGN_LINK_REGISTER
+ cbz x2,.Labort
+ cmp x2,#192
+ b.lo .Lshort
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+ tst w17,#ARMV8_SVE
+ b.eq .Lcheck_neon
+ stp x29,x30,[sp,#-16]!
+ sub sp,sp,#16
+	// The SVE path will inevitably advance the counter, and the
+	// Neon/scalar code that follows to process any tail data must
+	// use the updated value. Unfortunately the input counter buffer
+	// pointed to by ctr is meant to be read-only per the API contract,
+	// so we copy it to the stack, where the SVE code may write to it.
+ ldp x5,x6,[x4]
+ stp x5,x6,[sp]
+ mov x4,sp
+ bl ChaCha20_ctr32_sve
+ cbz x2,1f
+ bl ChaCha20_ctr32_dflt
+1:
+ add sp,sp,#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+#endif
+ b .Lshort
.size ChaCha20_ctr32,.-ChaCha20_ctr32
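
For reference, the dispatch performed by the new ChaCha20_ctr32 wrapper above can be pictured in C roughly as follows. This is only a sketch: chacha20_sve_blocks() and chacha20_default() are hypothetical stand-ins for ChaCha20_ctr32_sve and ChaCha20_ctr32_dflt, which in reality hand the updated out/in/length back in registers rather than through a return value.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* hypothetical stand-in: processes whole SVE-sized chunks, advances *counter,
 * and returns the number of bytes it consumed */
size_t chacha20_sve_blocks(uint8_t *out, const uint8_t *in, size_t len,
                           const uint32_t key[8], uint32_t counter[4]);
/* hypothetical stand-in for the scalar/Neon fallback */
void chacha20_default(uint8_t *out, const uint8_t *in, size_t len,
                      const uint32_t key[8], const uint32_t counter[4]);

void chacha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
                    const uint32_t key[8], const uint32_t counter[4])
{
    if (len == 0)
        return;
    uint32_t ctr_copy[4];                        /* caller's counter is read-only */
    memcpy(ctr_copy, counter, sizeof(ctr_copy));
    size_t done = chacha20_sve_blocks(out, in, len, key, ctr_copy);
    if (done < len)                              /* tail uses the advanced counter */
        chacha20_default(out + done, in + done, len - done, key, ctr_copy);
}
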
#ifdef __KERNEL__
@@ -317,7 +356,8 @@ ChaCha20_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
- adr x5,.Lsigma
+ adrp x5,.Lsigma
+ add x5,x5,#:lo12:.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
@@ -890,7 +930,8 @@ ChaCha20_512_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
- adr x5,.Lsigma
+ adrp x5,.Lsigma
+ add x5,x5,#:lo12:.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
diff --git a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S
index 73c367bcf1fc..688187ddcf43 100644
--- a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S
+++ b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S
@@ -1,7 +1,7 @@
/* Do not modify. This file is auto-generated from ecp_nistz256-armv8.pl. */
#include "arm_arch.h"
-.text
+.section .rodata
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,%object
.align 12
@@ -2391,6 +2391,8 @@ ecp_nistz256_precomputed:
.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
+.text
+
// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_to_mont
.type ecp_nistz256_to_mont,%function
@@ -2401,12 +2403,16 @@ ecp_nistz256_to_mont:
add x29,sp,#0
stp x19,x20,[sp,#16]
- ldr x3,.LRR // bp[0]
+ adrp x3,.LRR
+ ldr x3,[x3,#:lo12:.LRR] // bp[0]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
- adr x2,.LRR // &bp[0]
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ adrp x2,.LRR
+ add x2,x2,#:lo12:.LRR
bl __ecp_nistz256_mul_mont
@@ -2429,9 +2435,12 @@ ecp_nistz256_from_mont:
mov x3,#1 // bp[0]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
- adr x2,.Lone // &bp[0]
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ adrp x2,.Lone
+ add x2,x2,#:lo12:.Lone
bl __ecp_nistz256_mul_mont
@@ -2455,8 +2464,10 @@ ecp_nistz256_mul_mont:
ldr x3,[x2] // bp[0]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
bl __ecp_nistz256_mul_mont
@@ -2478,8 +2489,10 @@ ecp_nistz256_sqr_mont:
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
bl __ecp_nistz256_sqr_mont
@@ -2503,8 +2516,10 @@ ecp_nistz256_add:
ldp x8,x9,[x2]
ldp x16,x17,[x1,#16]
ldp x10,x11,[x2,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
bl __ecp_nistz256_add
@@ -2524,8 +2539,10 @@ ecp_nistz256_div_by_2:
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
bl __ecp_nistz256_div_by_2
@@ -2545,8 +2562,10 @@ ecp_nistz256_mul_by_2:
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
mov x8,x14
mov x9,x15
mov x10,x16
@@ -2570,8 +2589,10 @@ ecp_nistz256_mul_by_3:
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
mov x8,x14
mov x9,x15
mov x10,x16
@@ -2607,8 +2628,10 @@ ecp_nistz256_sub:
ldp x14,x15,[x1]
ldp x16,x17,[x1,#16]
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
bl __ecp_nistz256_sub_from
@@ -2631,8 +2654,10 @@ ecp_nistz256_neg:
mov x15,xzr
mov x16,xzr
mov x17,xzr
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
bl __ecp_nistz256_sub_from
@@ -3027,9 +3052,11 @@ ecp_nistz256_point_double:
mov x21,x0
ldp x16,x17,[x1,#48]
mov x22,x1
- ldr x12,.Lpoly+8
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
mov x8,x14
- ldr x13,.Lpoly+24
+ ldr x13,[x13,#24]
mov x9,x15
ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
mov x10,x16
@@ -3172,8 +3199,10 @@ ecp_nistz256_point_add:
mov x21,x0
mov x22,x1
mov x23,x2
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
orr x8,x4,x5
orr x10,x6,x7
orr x25,x8,x10
@@ -3423,8 +3452,10 @@ ecp_nistz256_point_add_affine:
mov x21,x0
mov x22,x1
mov x23,x2
- ldr x12,.Lpoly+8
- ldr x13,.Lpoly+24
+ adrp x13,.Lpoly
+ add x13,x13,#:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
ldp x4,x5,[x1,#64] // in1_z
ldp x6,x7,[x1,#64+16]
@@ -3570,7 +3601,8 @@ ecp_nistz256_point_add_affine:
ldp x10,x11,[x23,#0+48]
stp x14,x15,[x21,#0]
stp x16,x17,[x21,#0+16]
- adr x23,.Lone_mont-64
+ adrp x23,.Lone_mont-64
+ add x23,x23,#:lo12:.Lone_mont-64
ldp x14,x15,[x22,#32] // in1
cmp x24,#0 // ~, remember?
ldp x16,x17,[x22,#32+16]
@@ -3628,7 +3660,8 @@ ecp_nistz256_ord_mul_mont:
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
- adr x23,.Lord
+ adrp x23,.Lord
+ add x23,x23,#:lo12:.Lord
ldr x3,[x2] // bp[0]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
@@ -3838,7 +3871,8 @@ ecp_nistz256_ord_sqr_mont:
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
- adr x23,.Lord
+ adrp x23,.Lord
+ add x23,x23,#:lo12:.Lord
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
diff --git a/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S
new file mode 100644
index 000000000000..c9d925a7bc77
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S
@@ -0,0 +1,837 @@
+/* Do not modify. This file is auto-generated from ecp_sm2p256-armv8.pl. */
+#include "arm_arch.h"
+.arch armv8-a
+.section .rodata
+
+.align 5
+// The polynomial p
+.Lpoly:
+.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
+// The order of polynomial n
+.Lord:
+.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
+// (p + 1) / 2
+.Lpoly_div_2:
+.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
+// (n + 1) / 2
+.Lord_div_2:
+.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
+
+.text
+
+// void bn_rshift1(BN_ULONG *a);
+.globl bn_rshift1
+.type bn_rshift1,%function
+.align 5
+bn_rshift1:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x0]
+ ldp x9,x10,[x0,#16]
+
+ // Right shift
+ extr x7,x8,x7,#1
+ extr x8,x9,x8,#1
+ extr x9,x10,x9,#1
+ lsr x10,x10,#1
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ ret
+.size bn_rshift1,.-bn_rshift1
+
+// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl bn_sub
+.type bn_sub,%function
+.align 5
+bn_sub:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Subtraction
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbc x10,x10,x14
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ ret
+.size bn_sub,.-bn_sub
+
+// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2
+.type ecp_sm2p256_div_by_2,%function
+.align 5
+ecp_sm2p256_div_by_2:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+
+ // Save the least significant bit
+ mov x3,x7
+
+ // Right shift 1
+ extr x7,x8,x7,#1
+ extr x8,x9,x8,#1
+ extr x9,x10,x9,#1
+ lsr x10,x10,#1
+
+ // Load mod
+ adrp x2,.Lpoly_div_2
+ add x2,x2,#:lo12:.Lpoly_div_2
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Parity check
+ tst x3,#1
+ csel x11,xzr,x11,eq
+ csel x12,xzr,x12,eq
+ csel x13,xzr,x13,eq
+ csel x14,xzr,x14,eq
+
+ // Add
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adc x10,x10,x14
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2
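
The routine above computes a/2 mod p by shifting right one bit and, when the input was odd, adding (p + 1)/2; since a and p are both odd in that case, (a >> 1) + (p + 1)/2 equals (a + p)/2 exactly. A minimal C sketch of the same idea (limbs little-endian as in .Lpoly_div_2; unlike the csel-based assembly it makes no constant-time claim):

#include <stdint.h>

/* (p + 1) / 2 for the SM2 prime, matching .Lpoly_div_2 above */
static const uint64_t SM2_P_DIV_2[4] = {
    0x8000000000000000ULL, 0xffffffff80000000ULL,
    0xffffffffffffffffULL, 0x7fffffff7fffffffULL,
};

static void sm2_div_by_2(uint64_t r[4], const uint64_t a[4])
{
    uint64_t odd = a[0] & 1;                     /* saved least significant bit */
    uint64_t t[4];
    t[0] = (a[0] >> 1) | (a[1] << 63);           /* 256-bit right shift by one */
    t[1] = (a[1] >> 1) | (a[2] << 63);
    t[2] = (a[2] >> 1) | (a[3] << 63);
    t[3] = a[3] >> 1;

    unsigned __int128 acc = 0;                   /* add (p+1)/2 only if a was odd */
    for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)t[i] + (odd ? SM2_P_DIV_2[i] : 0);
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
}
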
+
+// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_div_by_2_mod_ord
+.type ecp_sm2p256_div_by_2_mod_ord,%function
+.align 5
+ecp_sm2p256_div_by_2_mod_ord:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+
+ // Save the least significant bit
+ mov x3,x7
+
+ // Right shift 1
+ extr x7,x8,x7,#1
+ extr x8,x9,x8,#1
+ extr x9,x10,x9,#1
+ lsr x10,x10,#1
+
+ // Load mod
+ adrp x2,.Lord_div_2
+ add x2,x2,#:lo12:.Lord_div_2
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Parity check
+ tst x3,#1
+ csel x11,xzr,x11,eq
+ csel x12,xzr,x12,eq
+ csel x13,xzr,x13,eq
+ csel x14,xzr,x14,eq
+
+ // Add
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adc x10,x10,x14
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord
+
+// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
+.globl ecp_sm2p256_mul_by_3
+.type ecp_sm2p256_mul_by_3,%function
+.align 5
+ecp_sm2p256_mul_by_3:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+
+ // 2*a
+ adds x7,x7,x7
+ adcs x8,x8,x8
+ adcs x9,x9,x9
+ adcs x10,x10,x10
+ adcs x15,xzr,xzr
+
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Sub polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbcs x15,x15,xzr
+
+ csel x7,x7,x3,cs
+ csel x8,x8,x4,cs
+ csel x9,x9,x5,cs
+ csel x10,x10,x6,cs
+ eor x15,x15,x15
+
+ // 3*a
+ ldp x11,x12,[x1]
+ ldp x13,x14,[x1,#16]
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adcs x10,x10,x14
+ adcs x15,xzr,xzr
+
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Sub polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbcs x15,x15,xzr
+
+ csel x7,x7,x3,cs
+ csel x8,x8,x4,cs
+ csel x9,x9,x5,cs
+ csel x10,x10,x6,cs
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ ret
+.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3
+
+// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_add
+.type ecp_sm2p256_add,%function
+.align 5
+ecp_sm2p256_add:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Addition
+ adds x7,x7,x11
+ adcs x8,x8,x12
+ adcs x9,x9,x13
+ adcs x10,x10,x14
+ adc x15,xzr,xzr
+
+ // Load polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Backup Addition
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Sub polynomial
+ subs x3,x3,x11
+ sbcs x4,x4,x12
+ sbcs x5,x5,x13
+ sbcs x6,x6,x14
+ sbcs x15,x15,xzr
+
+ // Select based on carry
+ csel x7,x7,x3,cc
+ csel x8,x8,x4,cc
+ csel x9,x9,x5,cc
+ csel x10,x10,x6,cc
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_add,.-ecp_sm2p256_add
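
The pattern used here, a full-width addition followed by a trial subtraction of p and a selection on the borrow, can be sketched in C as below. This is an illustration only; unlike the csel-based assembly it is not constant-time.

#include <stdint.h>

static const uint64_t SM2_P[4] = {               /* .Lpoly, little-endian limbs */
    0xffffffffffffffffULL, 0xffffffff00000000ULL,
    0xffffffffffffffffULL, 0xfffffffeffffffffULL,
};

static void sm2_add(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t s[4], t[4], borrow = 0;
    unsigned __int128 acc = 0;
    for (int i = 0; i < 4; i++) {                /* s = a + b */
        acc += (unsigned __int128)a[i] + b[i];
        s[i] = (uint64_t)acc;
        acc >>= 64;
    }
    uint64_t carry = (uint64_t)acc;              /* bit 256 of the sum */
    for (int i = 0; i < 4; i++) {                /* t = s - p */
        unsigned __int128 d = (unsigned __int128)s[i] - SM2_P[i] - borrow;
        t[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    int ge_p = carry || !borrow;                 /* did the sum reach p? */
    for (int i = 0; i < 4; i++)
        r[i] = ge_p ? t[i] : s[i];
}
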
+
+// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub
+.type ecp_sm2p256_sub,%function
+.align 5
+ecp_sm2p256_sub:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Subtraction
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbc x15,xzr,xzr
+
+ // Load polynomial
+ adrp x2,.Lpoly
+ add x2,x2,#:lo12:.Lpoly
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Backup subtraction
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Add polynomial
+ adds x3,x3,x11
+ adcs x4,x4,x12
+ adcs x5,x5,x13
+ adcs x6,x6,x14
+ tst x15,x15
+
+ // Select based on carry
+ csel x7,x7,x3,eq
+ csel x8,x8,x4,eq
+ csel x9,x9,x5,eq
+ csel x10,x10,x6,eq
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_sub,.-ecp_sm2p256_sub
+
+// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
+.globl ecp_sm2p256_sub_mod_ord
+.type ecp_sm2p256_sub_mod_ord,%function
+.align 5
+ecp_sm2p256_sub_mod_ord:
+ AARCH64_VALID_CALL_TARGET
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Subtraction
+ subs x7,x7,x11
+ sbcs x8,x8,x12
+ sbcs x9,x9,x13
+ sbcs x10,x10,x14
+ sbc x15,xzr,xzr
+
+ // Load polynomial
+ adrp x2,.Lord
+ add x2,x2,#:lo12:.Lord
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+ // Backup subtraction
+ mov x3,x7
+ mov x4,x8
+ mov x5,x9
+ mov x6,x10
+
+ // Add polynomial
+ adds x3,x3,x11
+ adcs x4,x4,x12
+ adcs x5,x5,x13
+ adcs x6,x6,x14
+ tst x15,x15
+
+ // Select based on carry
+ csel x7,x7,x3,eq
+ csel x8,x8,x4,eq
+ csel x9,x9,x5,eq
+ csel x10,x10,x6,eq
+
+ // Store results
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+ ret
+.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
+
+.macro RDC
+ // a = | s7 | ... | s0 |, where si are 64-bit quantities
+ // = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
+ // | s7 | s6 | s5 | s4 |
+ // | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 |
+ // | s3 | s2 | s1 | s0 |
+ // | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 |
+ // =================================================
+ // | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+)
+ // | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+)
+ // | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+)
+ // | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+)
+ // | a12 | 0 | s7 | a13 | 0 | s6 | (+)
+ // | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+)
+ // | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+)
+ // | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ // | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ // | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ // | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ // | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-)
+ // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
+ // | V[3] | V[2] | V[1] | V[0] |
+
+ // 1. 64-bit addition
+ // t2=s6+s7+s7
+ adds x5,x13,x14
+ adcs x4,xzr,xzr
+ adds x5,x5,x14
+ adcs x4,x4,xzr
+ // t3=s4+s5+t2
+ adds x6,x11,x5
+ adcs x15,x4,xzr
+ adds x6,x6,x12
+ adcs x15,x15,xzr
+ // sum
+ adds x7,x7,x6
+ adcs x8,x8,x15
+ adcs x9,x9,x5
+ adcs x10,x10,x14
+ adcs x3,xzr,xzr
+ adds x10,x10,x4
+ adcs x3,x3,xzr
+
+ stp x7,x8,[sp,#32]
+ stp x9,x10,[sp,#48]
+
+ // 2. 64-bit to 32-bit spread
+ mov x4,#0xffffffff
+ mov x7,x11
+ mov x8,x12
+ mov x9,x13
+ mov x10,x14
+ and x7,x7,x4 // a8
+ and x8,x8,x4 // a10
+ and x9,x9,x4 // a12
+ and x10,x10,x4 // a14
+ lsr x11,x11,#32 // a9
+ lsr x12,x12,#32 // a11
+ lsr x13,x13,#32 // a13
+ lsr x14,x14,#32 // a15
+
+ // 3. 32-bit addition
+ add x4,x10,x9 // t1 <- a12 + a14
+ add x5,x14,x13 // t2 <- a13 + a15
+ add x6,x7,x11 // t3 <- a8 + a9
+ add x15,x10,x8 // t4 <- a10 + a14
+ add x14,x14,x12 // a15 <- a11 + a15
+ add x9,x5,x4 // a12 <- a12 + a13 + a14 + a15
+ add x8,x8,x9 // a10 <- a10 + a12 + a13 + a14 + a15
+ add x8,x8,x9 // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
+ add x8,x8,x6 // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
+ add x8,x8,x12 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+ add x9,x9,x13 // a12 <- a12 + 2*a13 + a14 + a15
+ add x9,x9,x12 // a12 <- a11 + a12 + 2*a13 + a14 + a15
+ add x9,x9,x7 // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
+ add x6,x6,x10 // t3 <- a8 + a9 + a14
+ add x6,x6,x13 // t3 <- a8 + a9 + a13 + a14
+ add x11,x11,x5 // a9 <- a9 + a13 + a15
+ add x12,x12,x11 // a11 <- a9 + a11 + a13 + a15
+ add x12,x12,x5 // a11 <- a9 + a11 + 2*(a13 + a15)
+ add x4,x4,x15 // t1 <- a10 + a12 + 2*a14
+
+ // U[0] s5 a9 + a11 + 2*(a13 + a15)
+ // U[1] t1 a10 + a12 + 2*a14
+ // U[2] -t3 a8 + a9 + a13 + a14
+ // U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15
+ // U[4] s4 a9 + a13 + a15
+ // U[5] t4 a10 + a14
+ // U[6] s7 a11 + a15
+ // U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+
+ // 4. 32-bit to 64-bit
+ lsl x7,x4,#32
+ extr x4,x9,x4,#32
+ extr x9,x15,x9,#32
+ extr x15,x8,x15,#32
+ lsr x8,x8,#32
+
+ // 5. 64-bit addition
+ adds x12,x12,x7
+ adcs x4,x4,xzr
+ adcs x11,x11,x9
+ adcs x14,x14,x15
+ adcs x3,x3,x8
+
+ // V[0] s5
+ // V[1] t1
+ // V[2] s4
+ // V[3] s7
+ // carry t0
+ // sub t3
+
+	// 6. Process s0-s3
+ ldp x7,x8,[sp,#32]
+ ldp x9,x10,[sp,#48]
+ // add with V0-V3
+ adds x7,x7,x12
+ adcs x8,x8,x4
+ adcs x9,x9,x11
+ adcs x10,x10,x14
+ adcs x3,x3,xzr
+ // sub with t3
+ subs x8,x8,x6
+ sbcs x9,x9,xzr
+ sbcs x10,x10,xzr
+ sbcs x3,x3,xzr
+
+	// 7. MOD
+ // First Mod
+ lsl x4,x3,#32
+ subs x5,x4,x3
+
+ adds x7,x7,x3
+ adcs x8,x8,x5
+ adcs x9,x9,xzr
+ adcs x10,x10,x4
+
+ // Last Mod
+ // return y - p if y > p else y
+ mov x11,x7
+ mov x12,x8
+ mov x13,x9
+ mov x14,x10
+
+ adrp x3,.Lpoly
+ add x3,x3,#:lo12:.Lpoly
+ ldp x4,x5,[x3]
+ ldp x6,x15,[x3,#16]
+
+ adcs x16,xzr,xzr
+
+ subs x7,x7,x4
+ sbcs x8,x8,x5
+ sbcs x9,x9,x6
+ sbcs x10,x10,x15
+ sbcs x16,x16,xzr
+
+ csel x7,x7,x11,cs
+ csel x8,x8,x12,cs
+ csel x9,x9,x13,cs
+ csel x10,x10,x14,cs
+
+.endm
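
The reduction in RDC relies on 2^256 ≡ 2^224 + 2^96 - 2^64 + 1 (mod p) for the SM2 prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1. Below is a rough C sketch of only the "First Mod" step: folding a small overflow word t (t < 2^32, as produced by the carry chain above) back into the four result limbs. The "Last Mod" conditional subtraction of p then completes the reduction.

#include <stdint.h>

/* returns the carry out of bit 255, which the tail of RDC feeds into the
 * final conditional subtraction of p; assumes t < 2^32 */
static uint64_t sm2_fold_overflow(uint64_t r[4], uint64_t t)
{
    /* t * (2^224 + 2^96 - 2^64 + 1) expressed in 64-bit limbs:
     * limb0 += t, limb1 += (t << 32) - t, limb3 += t << 32 (limb2 takes carries only) */
    const uint64_t add[4] = { t, (t << 32) - t, 0, t << 32 };
    unsigned __int128 acc = 0;
    for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)r[i] + add[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
    return (uint64_t)acc;
}
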
+
+// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
+.globl ecp_sm2p256_mul
+.type ecp_sm2p256_mul,%function
+.align 5
+ecp_sm2p256_mul:
+ AARCH64_SIGN_LINK_REGISTER
+ // Store scalar registers
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x16,x17,[sp,#16]
+ stp x19,x20,[sp,#64]
+
+ // Load inputs
+ ldp x7,x8,[x1]
+ ldp x9,x10,[x1,#16]
+ ldp x11,x12,[x2]
+ ldp x13,x14,[x2,#16]
+
+// ### multiplication ###
+ // ========================
+ // s3 s2 s1 s0
+ // * s7 s6 s5 s4
+ // ------------------------
+ // + s0 s0 s0 s0
+ // * * * *
+ // s7 s6 s5 s4
+ // s1 s1 s1 s1
+ // * * * *
+ // s7 s6 s5 s4
+ // s2 s2 s2 s2
+ // * * * *
+ // s7 s6 s5 s4
+ // s3 s3 s3 s3
+ // * * * *
+ // s7 s6 s5 s4
+ // ------------------------
+ // s7 s6 s5 s4 s3 s2 s1 s0
+ // ========================
+
+// ### s0*s4 ###
+ mul x16,x7,x11
+ umulh x5,x7,x11
+
+// ### s1*s4 + s0*s5 ###
+ mul x3,x8,x11
+ umulh x4,x8,x11
+ adds x5,x5,x3
+ adcs x6,x4,xzr
+
+ mul x3,x7,x12
+ umulh x4,x7,x12
+ adds x5,x5,x3
+ adcs x6,x6,x4
+ adcs x15,xzr,xzr
+
+// ### s2*s4 + s1*s5 + s0*s6 ###
+ mul x3,x9,x11
+ umulh x4,x9,x11
+ adds x6,x6,x3
+ adcs x15,x15,x4
+
+ mul x3,x8,x12
+ umulh x4,x8,x12
+ adds x6,x6,x3
+ adcs x15,x15,x4
+ adcs x17,xzr,xzr
+
+ mul x3,x7,x13
+ umulh x4,x7,x13
+ adds x6,x6,x3
+ adcs x15,x15,x4
+ adcs x17,x17,xzr
+
+// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
+ mul x3,x10,x11
+ umulh x4,x10,x11
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,xzr,xzr
+
+ mul x3,x9,x12
+ umulh x4,x9,x12
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,x19,xzr
+
+ mul x3,x8,x13
+ umulh x4,x8,x13
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,x19,xzr
+
+ mul x3,x7,x14
+ umulh x4,x7,x14
+ adds x15,x15,x3
+ adcs x17,x17,x4
+ adcs x19,x19,xzr
+
+// ### s3*s5 + s2*s6 + s1*s7 ###
+ mul x3,x10,x12
+ umulh x4,x10,x12
+ adds x17,x17,x3
+ adcs x19,x19,x4
+ adcs x20,xzr,xzr
+
+ mul x3,x9,x13
+ umulh x4,x9,x13
+ adds x17,x17,x3
+ adcs x19,x19,x4
+ adcs x20,x20,xzr
+
+ mul x3,x8,x14
+ umulh x4,x8,x14
+ adds x11,x17,x3
+ adcs x19,x19,x4
+ adcs x20,x20,xzr
+
+// ### s3*s6 + s2*s7 ###
+ mul x3,x10,x13
+ umulh x4,x10,x13
+ adds x19,x19,x3
+ adcs x20,x20,x4
+ adcs x17,xzr,xzr
+
+ mul x3,x9,x14
+ umulh x4,x9,x14
+ adds x12,x19,x3
+ adcs x20,x20,x4
+ adcs x17,x17,xzr
+
+// ### s3*s7 ###
+ mul x3,x10,x14
+ umulh x4,x10,x14
+ adds x13,x20,x3
+ adcs x14,x17,x4
+
+ mov x7,x16
+ mov x8,x5
+ mov x9,x6
+ mov x10,x15
+
+ // result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+
+// ### Reduction ###
+ RDC
+
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ // Restore scalar registers
+ ldp x16,x17,[sp,#16]
+ ldp x19,x20,[sp,#64]
+ ldp x29,x30,[sp],#80
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_sm2p256_mul,.-ecp_sm2p256_mul
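
The diagram inside ecp_sm2p256_mul above is ordinary 4x4-limb schoolbook multiplication. A compact C equivalent, accumulating column by column with 128-bit intermediates and making no attempt to mirror the assembly's register scheduling:

#include <stdint.h>

/* r (8 limbs) = a (4 limbs) * b (4 limbs), little-endian limbs */
static void mul_256x256(uint64_t r[8], const uint64_t a[4], const uint64_t b[4])
{
    for (int i = 0; i < 8; i++)
        r[i] = 0;
    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < 4; j++) {
            unsigned __int128 acc = (unsigned __int128)a[i] * b[j]
                                  + r[i + j] + carry;
            r[i + j] = (uint64_t)acc;
            carry = (uint64_t)(acc >> 64);
        }
        r[i + 4] = carry;
    }
    /* r now holds the 512-bit product s7..s0, which RDC reduces mod p */
}
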
+
+// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
+.globl ecp_sm2p256_sqr
+.type ecp_sm2p256_sqr,%function
+.align 5
+
+ecp_sm2p256_sqr:
+ AARCH64_SIGN_LINK_REGISTER
+ // Store scalar registers
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x16,x17,[sp,#16]
+ stp x19,x20,[sp,#64]
+
+ // Load inputs
+ ldp x11,x12,[x1]
+ ldp x13,x14,[x1,#16]
+
+// ### square ###
+ // ========================
+ // s7 s6 s5 s4
+ // * s7 s6 s5 s4
+ // ------------------------
+ // + s4 s4 s4 s4
+ // * * * *
+ // s7 s6 s5 s4
+ // s5 s5 s5 s5
+ // * * * *
+ // s7 s6 s5 s4
+ // s6 s6 s6 s6
+ // * * * *
+ // s7 s6 s5 s4
+ // s7 s7 s7 s7
+ // * * * *
+ // s7 s6 s5 s4
+ // ------------------------
+ // s7 s6 s5 s4 s3 s2 s1 s0
+ // ========================
+
+// ### s4*s5 ###
+ mul x8,x11,x12
+ umulh x9,x11,x12
+
+// ### s4*s6 ###
+ mul x3,x13,x11
+ umulh x10,x13,x11
+ adds x9,x9,x3
+ adcs x10,x10,xzr
+
+// ### s4*s7 + s5*s6 ###
+ mul x3,x14,x11
+ umulh x4,x14,x11
+ adds x10,x10,x3
+ adcs x7,x4,xzr
+
+ mul x3,x13,x12
+ umulh x4,x13,x12
+ adds x10,x10,x3
+ adcs x7,x7,x4
+ adcs x5,xzr,xzr
+
+// ### s5*s7 ###
+ mul x3,x14,x12
+ umulh x4,x14,x12
+ adds x7,x7,x3
+ adcs x5,x5,x4
+
+// ### s6*s7 ###
+ mul x3,x14,x13
+ umulh x4,x14,x13
+ adds x5,x5,x3
+ adcs x6,x4,xzr
+
+// ### 2*(t3,t2,s0,s3,s2,s1) ###
+ adds x8,x8,x8
+ adcs x9,x9,x9
+ adcs x10,x10,x10
+ adcs x7,x7,x7
+ adcs x5,x5,x5
+ adcs x6,x6,x6
+ adcs x15,xzr,xzr
+
+// ### s4*s4 ###
+ mul x16,x11,x11
+ umulh x17,x11,x11
+
+// ### s5*s5 ###
+ mul x11,x12,x12
+ umulh x12,x12,x12
+
+// ### s6*s6 ###
+ mul x3,x13,x13
+ umulh x4,x13,x13
+
+// ### s7*s7 ###
+ mul x19,x14,x14
+ umulh x20,x14,x14
+
+ adds x8,x8,x17
+ adcs x9,x9,x11
+ adcs x10,x10,x12
+ adcs x7,x7,x3
+ adcs x5,x5,x4
+ adcs x6,x6,x19
+ adcs x15,x15,x20
+
+ mov x11,x7
+ mov x7,x16
+ mov x12,x5
+ mov x13,x6
+ mov x14,x15
+
+	// result of square: s7 s6 s5 s4 s3 s2 s1 s0
+
+// ### Reduction ###
+ RDC
+
+ stp x7,x8,[x0]
+ stp x9,x10,[x0,#16]
+
+ // Restore scalar registers
+ ldp x16,x17,[sp,#16]
+ ldp x19,x20,[sp,#64]
+ ldp x29,x30,[sp],#80
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
diff --git a/sys/crypto/openssl/aarch64/ghashv8-armx.S b/sys/crypto/openssl/aarch64/ghashv8-armx.S
index 42f053d664ef..b92c6316eae5 100644
--- a/sys/crypto/openssl/aarch64/ghashv8-armx.S
+++ b/sys/crypto/openssl/aarch64/ghashv8-armx.S
@@ -84,15 +84,103 @@ gcm_init_v8:
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
- eor v20.16b, v0.16b,v18.16b //H^3
- eor v22.16b,v5.16b,v4.16b //H^4
+ eor v23.16b, v0.16b,v18.16b //H^3
+ eor v25.16b,v5.16b,v4.16b //H^4
+
+ ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing
+ ext v17.16b,v25.16b,v25.16b,#8
+ ext v18.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v23.16b
+ eor v17.16b,v17.16b,v25.16b
+ eor v18.16b,v18.16b,v22.16b
+ ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5]
+
+ //calculate H^5 and H^6
+ pmull v0.1q,v22.1d, v23.1d
+ pmull v5.1q,v23.1d,v23.1d
+ pmull2 v2.1q,v22.2d, v23.2d
+ pmull2 v7.1q,v23.2d,v23.2d
+ pmull v1.1q,v16.1d,v18.1d
+ pmull v6.1q,v16.1d,v16.1d
- ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
- ext v17.16b,v22.16b,v22.16b,#8
- eor v16.16b,v16.16b,v20.16b
- eor v17.16b,v17.16b,v22.16b
- ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
- st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
+ ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v26.16b,v0.16b,v18.16b //H^5
+ eor v28.16b,v5.16b,v4.16b //H^6
+
+ ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing
+ ext v17.16b,v28.16b,v28.16b,#8
+ ext v18.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v26.16b
+ eor v17.16b,v17.16b,v28.16b
+ eor v18.16b,v18.16b,v22.16b
+ ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8]
+
+ //calculate H^7 and H^8
+ pmull v0.1q,v22.1d,v26.1d
+ pmull v5.1q,v22.1d,v28.1d
+ pmull2 v2.1q,v22.2d,v26.2d
+ pmull2 v7.1q,v22.2d,v28.2d
+ pmull v1.1q,v16.1d,v18.1d
+ pmull v6.1q,v17.1d,v18.1d
+
+ ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v29.16b,v0.16b,v18.16b //H^7
+ eor v31.16b,v5.16b,v4.16b //H^8
+
+ ext v16.16b,v29.16b,v29.16b,#8 //Karatsuba pre-processing
+ ext v17.16b,v31.16b,v31.16b,#8
+ eor v16.16b,v16.16b,v29.16b
+ eor v17.16b,v17.16b,v31.16b
+ ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11]
ret
.size gcm_init_v8,.-gcm_init_v8
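
The additional Htable entries computed above are consecutive powers of the hash key (H^3 through H^8), stored alongside their Karatsuba "high xor low" halves for the 8-block GHASH path. Purely to illustrate where the powers come from, here is a generic bit-at-a-time GF(2^128) multiply in the GHASH bit ordering; the assembly keeps H in a preprocessed form, so this does not reproduce the table's in-memory layout.

#include <stdint.h>

/* z = x * y in GF(2^128) per the GCM specification; each element is held
 * big-endian in two 64-bit words (word 0 = bytes 0..7) */
static void gf128_mul(uint64_t z[2], const uint64_t x[2], const uint64_t y[2])
{
    uint64_t z0 = 0, z1 = 0, v0 = y[0], v1 = y[1];
    for (int i = 0; i < 128; i++) {
        uint64_t xi = (i < 64) ? (x[0] >> (63 - i)) & 1
                               : (x[1] >> (127 - i)) & 1;
        if (xi) { z0 ^= v0; z1 ^= v1; }
        uint64_t lsb = v1 & 1;
        v1 = (v1 >> 1) | (v0 << 63);      /* V >>= 1 in GHASH's reflected bit order */
        v0 >>= 1;
        if (lsb)
            v0 ^= 0xe100000000000000ULL;  /* reduce by x^128 + x^7 + x^2 + x + 1 */
    }
    z[0] = z0; z[1] = z1;
}

/* powers[i] = H^(i+1); the table above stores these (in its own format)
 * together with precomputed Karatsuba middle terms */
static void ghash_powers(uint64_t powers[8][2], const uint64_t H[2])
{
    powers[0][0] = H[0]; powers[0][1] = H[1];
    for (int i = 1; i < 8; i++)
        gf128_mul(powers[i], powers[i - 1], H);
}
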
.globl gcm_gmult_v8
@@ -550,6 +638,7 @@ gcm_ghash_v8_4x:
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+.section .rodata
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
diff --git a/sys/crypto/openssl/aarch64/keccak1600-armv8.S b/sys/crypto/openssl/aarch64/keccak1600-armv8.S
index 08b3cc351213..e57e06f0f837 100644
--- a/sys/crypto/openssl/aarch64/keccak1600-armv8.S
+++ b/sys/crypto/openssl/aarch64/keccak1600-armv8.S
@@ -1,7 +1,7 @@
/* Do not modify. This file is auto-generated from keccak1600-armv8.pl. */
#include "arm_arch.h"
-.text
+.section .rodata
.align 8 // strategic alignment and padding that allows to use
// address value as loop termination condition...
@@ -33,11 +33,14 @@ iotas:
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas,.-iotas
+.text
+
.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
AARCH64_SIGN_LINK_REGISTER
- adr x28,iotas
+ adrp x28,iotas
+ add x28,x28,#:lo12:iotas
stp x28,x30,[sp,#16] // 32 bytes on top are mine
b .Loop
.align 4
@@ -517,6 +520,8 @@ SHA3_squeeze:
mov x20,x1
mov x21,x2
mov x22,x3
+ cmp w4, #0 // w4 = 'next' argument
+ bne .Lnext_block
.Loop_squeeze:
ldr x4,[x0],#8
@@ -531,7 +536,7 @@ SHA3_squeeze:
subs x3,x3,#8
bhi .Loop_squeeze
-
+.Lnext_block:
mov x0,x19
bl KeccakF1600
mov x0,x19
@@ -577,7 +582,8 @@ SHA3_squeeze:
.align 5
KeccakF1600_ce:
mov x9,#24
- adr x10,iotas
+ adrp x10,iotas
+ add x10,x10,#:lo12:iotas
b .Loop_ce
.align 4
.Loop_ce:
diff --git a/sys/crypto/openssl/aarch64/md5-aarch64.S b/sys/crypto/openssl/aarch64/md5-aarch64.S
new file mode 100644
index 000000000000..88e736e49687
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/md5-aarch64.S
@@ -0,0 +1,678 @@
+/* Do not modify. This file is auto-generated from md5-aarch64.pl. */
+#include "arm_arch.h"
+
+.text
+.globl ossl_md5_block_asm_data_order
+.type ossl_md5_block_asm_data_order,@function
+ossl_md5_block_asm_data_order:
+ AARCH64_VALID_CALL_TARGET
+ // Save all callee-saved registers
+ stp x19,x20,[sp,#-80]!
+ stp x21,x22,[sp,#16]
+ stp x23,x24,[sp,#32]
+ stp x25,x26,[sp,#48]
+ stp x27,x28,[sp,#64]
+
+ ldp w10, w11, [x0, #0] // .Load MD5 state->A and state->B
+ ldp w12, w13, [x0, #8] // .Load MD5 state->C and state->D
+.align 5
+ossl_md5_blocks_loop:
+ eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ ldp w15, w20, [x1] // .Load 2 words of input data0 M[0],M[1]
+ ldp w3, w21, [x1, #8] // .Load 2 words of input data0 M[2],M[3]
+#ifdef __AARCH64EB__
+ rev w15, w15
+ rev w20, w20
+ rev w3, w3
+ rev w21, w21
+#endif
+ eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x9, #0xa478 // .Load lower half of constant 0xd76aa478
+ movk x9, #0xd76a, lsl #16 // .Load upper half of constant 0xd76aa478
+ add w8, w10, w15 // Add dest value
+ add w7, w8, w9 // Add constant 0xd76aa478
+ add w6, w7, w14 // Add aux function result
+ ror w6, w6, #25 // Rotate left s=7 bits
+ eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0])
+ and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x16, #0xb756 // .Load lower half of constant 0xe8c7b756
+ movk x16, #0xe8c7, lsl #16 // .Load upper half of constant 0xe8c7b756
+ add w9, w13, w20 // Add dest value
+ add w7, w9, w16 // Add constant 0xe8c7b756
+ add w14, w7, w17 // Add aux function result
+ ror w14, w14, #20 // Rotate left s=12 bits
+ eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1])
+ and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x16, #0x70db // .Load lower half of constant 0x242070db
+ movk x16, #0x2420, lsl #16 // .Load upper half of constant 0x242070db
+ add w7, w12, w3 // Add dest value
+ add w17, w7, w16 // Add constant 0x242070db
+ add w14, w17, w9 // Add aux function result
+ ror w14, w14, #15 // Rotate left s=17 bits
+ eor x6, x5, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2])
+ and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x9, #0xceee // .Load lower half of constant 0xc1bdceee
+ movk x9, #0xc1bd, lsl #16 // .Load upper half of constant 0xc1bdceee
+ add w14, w11, w21 // Add dest value
+ add w6, w14, w9 // Add constant 0xc1bdceee
+ add w7, w6, w16 // Add aux function result
+ ror w7, w7, #10 // Rotate left s=22 bits
+ eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3])
+ ldp w14, w22, [x1, #16] // .Load 2 words of input data0 M[4],M[5]
+ ldp w7, w23, [x1, #24] // .Load 2 words of input data0 M[6],M[7]
+#ifdef __AARCH64EB__
+ rev w14, w14
+ rev w22, w22
+ rev w7, w7
+ rev w23, w23
+#endif
+ and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x16, #0xfaf // .Load lower half of constant 0xf57c0faf
+ movk x16, #0xf57c, lsl #16 // .Load upper half of constant 0xf57c0faf
+ add w17, w4, w14 // Add dest value
+ add w16, w17, w16 // Add constant 0xf57c0faf
+ add w4, w16, w6 // Add aux function result
+ ror w4, w4, #25 // Rotate left s=7 bits
+ eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4])
+ and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x4, #0xc62a // .Load lower half of constant 0x4787c62a
+ movk x4, #0x4787, lsl #16 // .Load upper half of constant 0x4787c62a
+ add w16, w5, w22 // Add dest value
+ add w16, w16, w4 // Add constant 0x4787c62a
+ add w5, w16, w6 // Add aux function result
+ ror w5, w5, #20 // Rotate left s=12 bits
+ eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5])
+ and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x4, #0x4613 // .Load lower half of constant 0xa8304613
+ movk x4, #0xa830, lsl #16 // .Load upper half of constant 0xa8304613
+ add w6, w8, w7 // Add dest value
+ add w8, w6, w4 // Add constant 0xa8304613
+ add w4, w8, w5 // Add aux function result
+ ror w4, w4, #15 // Rotate left s=17 bits
+ eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6])
+ and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x6, #0x9501 // .Load lower half of constant 0xfd469501
+ movk x6, #0xfd46, lsl #16 // .Load upper half of constant 0xfd469501
+ add w9, w9, w23 // Add dest value
+ add w5, w9, w6 // Add constant 0xfd469501
+ add w9, w5, w4 // Add aux function result
+ ror w9, w9, #10 // Rotate left s=22 bits
+ eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7])
+ ldp w5, w24, [x1, #32] // .Load 2 words of input data0 M[8],M[9]
+ ldp w16, w25, [x1, #40] // .Load 2 words of input data0 M[10],M[11]
+#ifdef __AARCH64EB__
+ rev w5, w5
+ rev w24, w24
+ rev w16, w16
+ rev w25, w25
+#endif
+ and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x9, #0x98d8 // .Load lower half of constant 0x698098d8
+ movk x9, #0x6980, lsl #16 // .Load upper half of constant 0x698098d8
+ add w17, w17, w5 // Add dest value
+ add w9, w17, w9 // Add constant 0x698098d8
+ add w17, w9, w6 // Add aux function result
+ ror w17, w17, #25 // Rotate left s=7 bits
+ eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8])
+ and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x17, #0xf7af // .Load lower half of constant 0x8b44f7af
+ movk x17, #0x8b44, lsl #16 // .Load upper half of constant 0x8b44f7af
+ add w19, w19, w24 // Add dest value
+ add w17, w19, w17 // Add constant 0x8b44f7af
+ add w19, w17, w9 // Add aux function result
+ ror w19, w19, #20 // Rotate left s=12 bits
+ eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9])
+ and x9, x9, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x11, #0x5bb1 // .Load lower half of constant 0xffff5bb1
+ movk x11, #0xffff, lsl #16 // .Load upper half of constant 0xffff5bb1
+ add w8, w8, w16 // Add dest value
+ add w8, w8, w11 // Add constant 0xffff5bb1
+ add w8, w8, w9 // Add aux function result
+ ror w8, w8, #15 // Rotate left s=17 bits
+ eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10])
+ and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x11, #0xd7be // .Load lower half of constant 0x895cd7be
+ movk x11, #0x895c, lsl #16 // .Load upper half of constant 0x895cd7be
+ add w4, w4, w25 // Add dest value
+ add w4, w4, w11 // Add constant 0x895cd7be
+ add w9, w4, w9 // Add aux function result
+ ror w9, w9, #10 // Rotate left s=22 bits
+ eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11])
+ ldp w11, w26, [x1, #48] // .Load 2 words of input data0 M[12],M[13]
+ ldp w12, w27, [x1, #56] // .Load 2 words of input data0 M[14],M[15]
+#ifdef __AARCH64EB__
+ rev w11, w11
+ rev w26, w26
+ rev w12, w12
+ rev w27, w27
+#endif
+ and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x19, #0x1122 // .Load lower half of constant 0x6b901122
+ movk x19, #0x6b90, lsl #16 // .Load upper half of constant 0x6b901122
+ add w6, w6, w11 // Add dest value
+ add w6, w6, w19 // Add constant 0x6b901122
+ add w4, w6, w4 // Add aux function result
+ ror w4, w4, #25 // Rotate left s=7 bits
+ eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12])
+ and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x19, #0x7193 // .Load lower half of constant 0xfd987193
+ movk x19, #0xfd98, lsl #16 // .Load upper half of constant 0xfd987193
+ add w17, w17, w26 // Add dest value
+ add w17, w17, w19 // Add constant 0xfd987193
+ add w17, w17, w6 // Add aux function result
+ ror w17, w17, #20 // Rotate left s=12 bits
+ eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13])
+ and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x6, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x13, #0x438e // .Load lower half of constant 0xa679438e
+ movk x13, #0xa679, lsl #16 // .Load upper half of constant 0xa679438e
+ add w8, w8, w12 // Add dest value
+ add w8, w8, w13 // Add constant 0xa679438e
+ add w8, w8, w6 // Add aux function result
+ ror w8, w8, #15 // Rotate left s=17 bits
+ eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14])
+ and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x13, #0x821 // .Load lower half of constant 0x49b40821
+ movk x13, #0x49b4, lsl #16 // .Load upper half of constant 0x49b40821
+ add w9, w9, w27 // Add dest value
+ add w9, w9, w13 // Add constant 0x49b40821
+ add w9, w9, w6 // Add aux function result
+ ror w9, w9, #10 // Rotate left s=22 bits
+ bic x6, x8, x17 // Aux function round 2 (~z & y)
+ add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15])
+ movz x13, #0x2562 // .Load lower half of constant 0xf61e2562
+ movk x13, #0xf61e, lsl #16 // .Load upper half of constant 0xf61e2562
+ add w4, w4, w20 // Add dest value
+ add w4, w4, w13 // Add constant 0xf61e2562
+ and x13, x9, x17 // Aux function round 2 (x & z)
+ add w4, w4, w6 // Add (~z & y)
+ add w4, w4, w13 // Add (x & z)
+ ror w4, w4, #27 // Rotate left s=5 bits
+ bic x6, x9, x8 // Aux function round 2 (~z & y)
+ add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1])
+ movz x13, #0xb340 // .Load lower half of constant 0xc040b340
+ movk x13, #0xc040, lsl #16 // .Load upper half of constant 0xc040b340
+ add w17, w17, w7 // Add dest value
+ add w17, w17, w13 // Add constant 0xc040b340
+ and x13, x4, x8 // Aux function round 2 (x & z)
+ add w17, w17, w6 // Add (~z & y)
+ add w17, w17, w13 // Add (x & z)
+ ror w17, w17, #23 // Rotate left s=9 bits
+ bic x6, x4, x9 // Aux function round 2 (~z & y)
+ add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6])
+ movz x13, #0x5a51 // .Load lower half of constant 0x265e5a51
+ movk x13, #0x265e, lsl #16 // .Load upper half of constant 0x265e5a51
+ add w8, w8, w25 // Add dest value
+ add w8, w8, w13 // Add constant 0x265e5a51
+ and x13, x17, x9 // Aux function round 2 (x & z)
+ add w8, w8, w6 // Add (~z & y)
+ add w8, w8, w13 // Add (x & z)
+ ror w8, w8, #18 // Rotate left s=14 bits
+ bic x6, x17, x4 // Aux function round 2 (~z & y)
+ add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11])
+ movz x13, #0xc7aa // .Load lower half of constant 0xe9b6c7aa
+ movk x13, #0xe9b6, lsl #16 // .Load upper half of constant 0xe9b6c7aa
+ add w9, w9, w15 // Add dest value
+ add w9, w9, w13 // Add constant 0xe9b6c7aa
+ and x13, x8, x4 // Aux function round 2 (x & z)
+ add w9, w9, w6 // Add (~z & y)
+ add w9, w9, w13 // Add (x & z)
+ ror w9, w9, #12 // Rotate left s=20 bits
+ bic x6, x8, x17 // Aux function round 2 (~z & y)
+ add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0])
+ movz x13, #0x105d // .Load lower half of constant 0xd62f105d
+ movk x13, #0xd62f, lsl #16 // .Load upper half of constant 0xd62f105d
+ add w4, w4, w22 // Add dest value
+ add w4, w4, w13 // Add constant 0xd62f105d
+ and x13, x9, x17 // Aux function round 2 (x & z)
+ add w4, w4, w6 // Add (~z & y)
+ add w4, w4, w13 // Add (x & z)
+ ror w4, w4, #27 // Rotate left s=5 bits
+ bic x6, x9, x8 // Aux function round 2 (~z & y)
+ add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5])
+ movz x13, #0x1453 // .Load lower half of constant 0x2441453
+ movk x13, #0x244, lsl #16 // .Load upper half of constant 0x2441453
+ add w17, w17, w16 // Add dest value
+ add w17, w17, w13 // Add constant 0x2441453
+ and x13, x4, x8 // Aux function round 2 (x & z)
+ add w17, w17, w6 // Add (~z & y)
+ add w17, w17, w13 // Add (x & z)
+ ror w17, w17, #23 // Rotate left s=9 bits
+ bic x6, x4, x9 // Aux function round 2 (~z & y)
+ add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10])
+ movz x13, #0xe681 // .Load lower half of constant 0xd8a1e681
+ movk x13, #0xd8a1, lsl #16 // .Load upper half of constant 0xd8a1e681
+ add w8, w8, w27 // Add dest value
+ add w8, w8, w13 // Add constant 0xd8a1e681
+ and x13, x17, x9 // Aux function round 2 (x & z)
+ add w8, w8, w6 // Add (~z & y)
+ add w8, w8, w13 // Add (x & z)
+ ror w8, w8, #18 // Rotate left s=14 bits
+ bic x6, x17, x4 // Aux function round 2 (~z & y)
+ add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15])
+ movz x13, #0xfbc8 // .Load lower half of constant 0xe7d3fbc8
+ movk x13, #0xe7d3, lsl #16 // .Load upper half of constant 0xe7d3fbc8
+ add w9, w9, w14 // Add dest value
+ add w9, w9, w13 // Add constant 0xe7d3fbc8
+ and x13, x8, x4 // Aux function round 2 (x & z)
+ add w9, w9, w6 // Add (~z & y)
+ add w9, w9, w13 // Add (x & z)
+ ror w9, w9, #12 // Rotate left s=20 bits
+ bic x6, x8, x17 // Aux function round 2 (~z & y)
+ add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4])
+ movz x13, #0xcde6 // .Load lower half of constant 0x21e1cde6
+ movk x13, #0x21e1, lsl #16 // .Load upper half of constant 0x21e1cde6
+ add w4, w4, w24 // Add dest value
+ add w4, w4, w13 // Add constant 0x21e1cde6
+ and x13, x9, x17 // Aux function round 2 (x & z)
+ add w4, w4, w6 // Add (~z & y)
+ add w4, w4, w13 // Add (x & z)
+ ror w4, w4, #27 // Rotate left s=5 bits
+ bic x6, x9, x8 // Aux function round 2 (~z & y)
+ add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9])
+ movz x13, #0x7d6 // .Load lower half of constant 0xc33707d6
+ movk x13, #0xc337, lsl #16 // .Load upper half of constant 0xc33707d6
+ add w17, w17, w12 // Add dest value
+ add w17, w17, w13 // Add constant 0xc33707d6
+ and x13, x4, x8 // Aux function round 2 (x & z)
+ add w17, w17, w6 // Add (~z & y)
+ add w17, w17, w13 // Add (x & z)
+ ror w17, w17, #23 // Rotate left s=9 bits
+ bic x6, x4, x9 // Aux function round 2 (~z & y)
+ add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14])
+ movz x13, #0xd87 // .Load lower half of constant 0xf4d50d87
+ movk x13, #0xf4d5, lsl #16 // .Load upper half of constant 0xf4d50d87
+ add w8, w8, w21 // Add dest value
+ add w8, w8, w13 // Add constant 0xf4d50d87
+ and x13, x17, x9 // Aux function round 2 (x & z)
+ add w8, w8, w6 // Add (~z & y)
+ add w8, w8, w13 // Add (x & z)
+ ror w8, w8, #18 // Rotate left s=14 bits
+ bic x6, x17, x4 // Aux function round 2 (~z & y)
+ add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3])
+ movz x13, #0x14ed // .Load lower half of constant 0x455a14ed
+ movk x13, #0x455a, lsl #16 // .Load upper half of constant 0x455a14ed
+ add w9, w9, w5 // Add dest value
+ add w9, w9, w13 // Add constant 0x455a14ed
+ and x13, x8, x4 // Aux function round 2 (x & z)
+ add w9, w9, w6 // Add (~z & y)
+ add w9, w9, w13 // Add (x & z)
+ ror w9, w9, #12 // Rotate left s=20 bits
+ bic x6, x8, x17 // Aux function round 2 (~z & y)
+ add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8])
+ movz x13, #0xe905 // .Load lower half of constant 0xa9e3e905
+ movk x13, #0xa9e3, lsl #16 // .Load upper half of constant 0xa9e3e905
+ add w4, w4, w26 // Add dest value
+ add w4, w4, w13 // Add constant 0xa9e3e905
+ and x13, x9, x17 // Aux function round 2 (x & z)
+ add w4, w4, w6 // Add (~z & y)
+ add w4, w4, w13 // Add (x & z)
+ ror w4, w4, #27 // Rotate left s=5 bits
+ bic x6, x9, x8 // Aux function round 2 (~z & y)
+ add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13])
+ movz x13, #0xa3f8 // .Load lower half of constant 0xfcefa3f8
+ movk x13, #0xfcef, lsl #16 // .Load upper half of constant 0xfcefa3f8
+ add w17, w17, w3 // Add dest value
+ add w17, w17, w13 // Add constant 0xfcefa3f8
+ and x13, x4, x8 // Aux function round 2 (x & z)
+ add w17, w17, w6 // Add (~z & y)
+ add w17, w17, w13 // Add (x & z)
+ ror w17, w17, #23 // Rotate left s=9 bits
+ bic x6, x4, x9 // Aux function round 2 (~z & y)
+ add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2])
+ movz x13, #0x2d9 // .Load lower half of constant 0x676f02d9
+ movk x13, #0x676f, lsl #16 // .Load upper half of constant 0x676f02d9
+ add w8, w8, w23 // Add dest value
+ add w8, w8, w13 // Add constant 0x676f02d9
+ and x13, x17, x9 // Aux function round 2 (x & z)
+ add w8, w8, w6 // Add (~z & y)
+ add w8, w8, w13 // Add (x & z)
+ ror w8, w8, #18 // Rotate left s=14 bits
+ bic x6, x17, x4 // Aux function round 2 (~z & y)
+ add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7])
+ movz x13, #0x4c8a // .Load lower half of constant 0x8d2a4c8a
+ movk x13, #0x8d2a, lsl #16 // .Load upper half of constant 0x8d2a4c8a
+ add w9, w9, w11 // Add dest value
+ add w9, w9, w13 // Add constant 0x8d2a4c8a
+ and x13, x8, x4 // Aux function round 2 (x & z)
+ add w9, w9, w6 // Add (~z & y)
+ add w9, w9, w13 // Add (x & z)
+ eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w9, w9, #12 // Rotate left s=20 bits
+ movz x10, #0x3942 // .Load lower half of constant 0xfffa3942
+ add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12])
+ movk x10, #0xfffa, lsl #16 // .Load upper half of constant 0xfffa3942
+ add w4, w4, w22 // Add dest value
+ eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w4, w4, w10 // Add constant 0xfffa3942
+ add w4, w4, w6 // Add aux function result
+ ror w4, w4, #28 // Rotate left s=4 bits
+ eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x10, #0xf681 // .Load lower half of constant 0x8771f681
+ add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5])
+ movk x10, #0x8771, lsl #16 // .Load upper half of constant 0x8771f681
+ add w17, w17, w5 // Add dest value
+ eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w17, w17, w10 // Add constant 0x8771f681
+ add w17, w17, w6 // Add aux function result
+ eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w17, w17, #21 // Rotate left s=11 bits
+ movz x13, #0x6122 // .Load lower half of constant 0x6d9d6122
+ add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8])
+ movk x13, #0x6d9d, lsl #16 // .Load upper half of constant 0x6d9d6122
+ add w8, w8, w25 // Add dest value
+ eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w8, w8, w13 // Add constant 0x6d9d6122
+ add w8, w8, w6 // Add aux function result
+ ror w8, w8, #16 // Rotate left s=16 bits
+ eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x13, #0x380c // .Load lower half of constant 0xfde5380c
+ add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11])
+ movk x13, #0xfde5, lsl #16 // .Load upper half of constant 0xfde5380c
+ add w9, w9, w12 // Add dest value
+ eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w9, w9, w13 // Add constant 0xfde5380c
+ add w9, w9, w6 // Add aux function result
+ eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w9, w9, #9 // Rotate left s=23 bits
+ movz x10, #0xea44 // .Load lower half of constant 0xa4beea44
+ add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14])
+ movk x10, #0xa4be, lsl #16 // .Load upper half of constant 0xa4beea44
+ add w4, w4, w20 // Add dest value
+ eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w4, w4, w10 // Add constant 0xa4beea44
+ add w4, w4, w6 // Add aux function result
+ ror w4, w4, #28 // Rotate left s=4 bits
+ eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x10, #0xcfa9 // .Load lower half of constant 0x4bdecfa9
+ add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1])
+ movk x10, #0x4bde, lsl #16 // .Load upper half of constant 0x4bdecfa9
+ add w17, w17, w14 // Add dest value
+ eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w17, w17, w10 // Add constant 0x4bdecfa9
+ add w17, w17, w6 // Add aux function result
+ eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w17, w17, #21 // Rotate left s=11 bits
+ movz x13, #0x4b60 // .Load lower half of constant 0xf6bb4b60
+ add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4])
+ movk x13, #0xf6bb, lsl #16 // .Load upper half of constant 0xf6bb4b60
+ add w8, w8, w23 // Add dest value
+ eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w8, w8, w13 // Add constant 0xf6bb4b60
+ add w8, w8, w6 // Add aux function result
+ ror w8, w8, #16 // Rotate left s=16 bits
+ eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x13, #0xbc70 // .Load lower half of constant 0xbebfbc70
+ add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xf6bb4b60, s=16, M[7])
+ movk x13, #0xbebf, lsl #16 // .Load upper half of constant 0xbebfbc70
+ add w9, w9, w16 // Add dest value
+ eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w9, w9, w13 // Add constant 0xbebfbc70
+ add w9, w9, w6 // Add aux function result
+ eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w9, w9, #9 // Rotate left s=23 bits
+ movz x10, #0x7ec6 // .Load lower half of constant 0x289b7ec6
+ add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10])
+ movk x10, #0x289b, lsl #16 // .Load upper half of constant 0x289b7ec6
+ add w4, w4, w26 // Add dest value
+ eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w4, w4, w10 // Add constant 0x289b7ec6
+ add w4, w4, w6 // Add aux function result
+ ror w4, w4, #28 // Rotate left s=4 bits
+ eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x10, #0x27fa // .Load lower half of constant 0xeaa127fa
+ add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13])
+ movk x10, #0xeaa1, lsl #16 // .Load upper half of constant 0xeaa127fa
+ add w17, w17, w15 // Add dest value
+ eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w17, w17, w10 // Add constant 0xeaa127fa
+ add w17, w17, w6 // Add aux function result
+ eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w17, w17, #21 // Rotate left s=11 bits
+ movz x13, #0x3085 // .Load lower half of constant 0xd4ef3085
+ add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0])
+ movk x13, #0xd4ef, lsl #16 // .Load upper half of constant 0xd4ef3085
+ add w8, w8, w21 // Add dest value
+ eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w8, w8, w13 // Add constant 0xd4ef3085
+ add w8, w8, w6 // Add aux function result
+ ror w8, w8, #16 // Rotate left s=16 bits
+ eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x13, #0x1d05 // .Load lower half of constant 0x4881d05
+ add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3])
+ movk x13, #0x488, lsl #16 // .Load upper half of constant 0x4881d05
+ add w9, w9, w7 // Add dest value
+ eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w9, w9, w13 // Add constant 0x4881d05
+ add w9, w9, w6 // Add aux function result
+ eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w9, w9, #9 // Rotate left s=23 bits
+ movz x10, #0xd039 // .Load lower half of constant 0xd9d4d039
+ add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6])
+ movk x10, #0xd9d4, lsl #16 // .Load upper half of constant 0xd9d4d039
+ add w4, w4, w24 // Add dest value
+ eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w4, w4, w10 // Add constant 0xd9d4d039
+ add w4, w4, w6 // Add aux function result
+ ror w4, w4, #28 // Rotate left s=4 bits
+ eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x10, #0x99e5 // .Load lower half of constant 0xe6db99e5
+ add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9])
+ movk x10, #0xe6db, lsl #16 // .Load upper half of constant 0xe6db99e5
+ add w17, w17, w11 // Add dest value
+ eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w17, w17, w10 // Add constant 0xe6db99e5
+ add w17, w17, w6 // Add aux function result
+ eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ ror w17, w17, #21 // Rotate left s=11 bits
+ movz x13, #0x7cf8 // .Load lower half of constant 0x1fa27cf8
+ add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12])
+ movk x13, #0x1fa2, lsl #16 // .Load upper half of constant 0x1fa27cf8
+ add w8, w8, w27 // Add dest value
+ eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w8, w8, w13 // Add constant 0x1fa27cf8
+ add w8, w8, w6 // Add aux function result
+ ror w8, w8, #16 // Rotate left s=16 bits
+ eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
+ movz x13, #0x5665 // .Load lower half of constant 0xc4ac5665
+ add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15])
+ movk x13, #0xc4ac, lsl #16 // .Load upper half of constant 0xc4ac5665
+ add w9, w9, w3 // Add dest value
+ eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
+ add w9, w9, w13 // Add constant 0xc4ac5665
+ add w9, w9, w6 // Add aux function result
+ ror w9, w9, #9 // Rotate left s=23 bits
+ movz x6, #0x2244 // .Load lower half of constant 0xf4292244
+ movk x6, #0xf429, lsl #16 // .Load upper half of constant 0xf4292244
+ add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2])
+ add w4, w4, w15 // Add dest value
+ orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w4, w4, w6 // Add constant 0xf4292244
+ eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w4, w4, w6 // Add aux function result
+ ror w4, w4, #26 // Rotate left s=6 bits
+ movz x6, #0xff97 // .Load lower half of constant 0x432aff97
+ movk x6, #0x432a, lsl #16 // .Load upper half of constant 0x432aff97
+ add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0])
+ orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w17, w17, w23 // Add dest value
+ eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w17, w17, w6 // Add constant 0x432aff97
+ add w6, w17, w10 // Add aux function result
+ ror w6, w6, #22 // Rotate left s=10 bits
+ movz x17, #0x23a7 // .Load lower half of constant 0xab9423a7
+ movk x17, #0xab94, lsl #16 // .Load upper half of constant 0xab9423a7
+ add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7])
+ add w8, w8, w12 // Add dest value
+ orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w8, w8, w17 // Add constant 0xab9423a7
+ eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w8, w8, w17 // Add aux function result
+ ror w8, w8, #17 // Rotate left s=15 bits
+ movz x17, #0xa039 // .Load lower half of constant 0xfc93a039
+ movk x17, #0xfc93, lsl #16 // .Load upper half of constant 0xfc93a039
+ add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14])
+ orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w9, w9, w22 // Add dest value
+ eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w9, w9, w17 // Add constant 0xfc93a039
+ add w17, w9, w13 // Add aux function result
+ ror w17, w17, #11 // Rotate left s=21 bits
+ movz x9, #0x59c3 // .Load lower half of constant 0x655b59c3
+ movk x9, #0x655b, lsl #16 // .Load upper half of constant 0x655b59c3
+ add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5])
+ add w4, w4, w11 // Add dest value
+ orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w9, w4, w9 // Add constant 0x655b59c3
+ eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w9, w9, w4 // Add aux function result
+ ror w9, w9, #26 // Rotate left s=6 bits
+ movz x4, #0xcc92 // .Load lower half of constant 0x8f0ccc92
+ movk x4, #0x8f0c, lsl #16 // .Load upper half of constant 0x8f0ccc92
+ add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12])
+ orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w6, w6, w21 // Add dest value
+ eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w4, w6, w4 // Add constant 0x8f0ccc92
+ add w6, w4, w10 // Add aux function result
+ ror w6, w6, #22 // Rotate left s=10 bits
+ movz x4, #0xf47d // .Load lower half of constant 0xffeff47d
+ movk x4, #0xffef, lsl #16 // .Load upper half of constant 0xffeff47d
+ add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3])
+ add w8, w8, w16 // Add dest value
+ orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w8, w8, w4 // Add constant 0xffeff47d
+ eor x4, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w8, w8, w4 // Add aux function result
+ ror w8, w8, #17 // Rotate left s=15 bits
+ movz x4, #0x5dd1 // .Load lower half of constant 0x85845dd1
+ movk x4, #0x8584, lsl #16 // .Load upper half of constant 0x85845dd1
+ add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10])
+ orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w15, w17, w20 // Add dest value
+ eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w15, w15, w4 // Add constant 0x85845dd1
+ add w4, w15, w17 // Add aux function result
+ ror w4, w4, #11 // Rotate left s=21 bits
+ movz x15, #0x7e4f // .Load lower half of constant 0x6fa87e4f
+ movk x15, #0x6fa8, lsl #16 // .Load upper half of constant 0x6fa87e4f
+ add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1])
+ add w4, w9, w5 // Add dest value
+ orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w15, w4, w15 // Add constant 0x6fa87e4f
+ eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w9, w15, w4 // Add aux function result
+ ror w9, w9, #26 // Rotate left s=6 bits
+ movz x15, #0xe6e0 // .Load lower half of constant 0xfe2ce6e0
+ movk x15, #0xfe2c, lsl #16 // .Load upper half of constant 0xfe2ce6e0
+ add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8])
+ orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w6, w6, w27 // Add dest value
+ eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w15, w6, w15 // Add constant 0xfe2ce6e0
+ add w6, w15, w9 // Add aux function result
+ ror w6, w6, #22 // Rotate left s=10 bits
+ movz x9, #0x4314 // .Load lower half of constant 0xa3014314
+ movk x9, #0xa301, lsl #16 // .Load upper half of constant 0xa3014314
+ add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15])
+ add w6, w8, w7 // Add dest value
+ orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w8, w6, w9 // Add constant 0xa3014314
+ eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w6, w8, w9 // Add aux function result
+ ror w6, w6, #17 // Rotate left s=15 bits
+ movz x7, #0x11a1 // .Load lower half of constant 0x4e0811a1
+ movk x7, #0x4e08, lsl #16 // .Load upper half of constant 0x4e0811a1
+ add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6])
+ orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w6, w17, w26 // Add dest value
+ eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w9, w6, w7 // Add constant 0x4e0811a1
+ add w7, w9, w17 // Add aux function result
+ ror w7, w7, #11 // Rotate left s=21 bits
+ movz x6, #0x7e82 // .Load lower half of constant 0xf7537e82
+ movk x6, #0xf753, lsl #16 // .Load upper half of constant 0xf7537e82
+ add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13])
+ add w17, w4, w14 // Add dest value
+ orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w14, w17, w6 // Add constant 0xf7537e82
+ eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w17, w14, w4 // Add aux function result
+ ror w17, w17, #26 // Rotate left s=6 bits
+ movz x6, #0xf235 // .Load lower half of constant 0xbd3af235
+ movk x6, #0xbd3a, lsl #16 // .Load upper half of constant 0xbd3af235
+ add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4])
+ orn x14, x7, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w4, w15, w25 // Add dest value
+ eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w15, w4, w6 // Add constant 0xbd3af235
+ add w16, w15, w17 // Add aux function result
+ ror w16, w16, #22 // Rotate left s=10 bits
+ movz x14, #0xd2bb // .Load lower half of constant 0x2ad7d2bb
+ movk x14, #0x2ad7, lsl #16 // .Load upper half of constant 0x2ad7d2bb
+ add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11])
+ add w6, w8, w3 // Add dest value
+ orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w17, w6, w14 // Add constant 0x2ad7d2bb
+ eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w8, w17, w16 // Add aux function result
+ ror w8, w8, #17 // Rotate left s=15 bits
+ movz x3, #0xd391 // .Load lower half of constant 0xeb86d391
+ movk x3, #0xeb86, lsl #16 // .Load upper half of constant 0xeb86d391
+ add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2])
+ orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w15, w9, w24 // Add dest value
+ eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y)
+ add w16, w15, w3 // Add constant 0xeb86d391
+ add w8, w16, w17 // Add aux function result
+ ror w8, w8, #11 // Rotate left s=21 bits
+ ldp w6, w15, [x0] // Reload MD5 state->A and state->B
+ ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D
+ add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9])
+ add w13, w4, w9 // Add result of MD5 rounds to state->D
+ add w12, w14, w5 // Add result of MD5 rounds to state->C
+ add w10, w7, w6 // Add result of MD5 rounds to state->A
+ add w11, w3, w15 // Add result of MD5 rounds to state->B
+ stp w12, w13, [x0, #8] // Store MD5 states C,D
+ stp w10, w11, [x0] // Store MD5 states A,B
+ add x1, x1, #64 // Increment data pointer
+ subs w2, w2, #1 // Decrement block counter
+ b.ne ossl_md5_blocks_loop
+
+ ldp x21,x22,[sp,#16]
+ ldp x23,x24,[sp,#32]
+ ldp x25,x26,[sp,#48]
+ ldp x27,x28,[sp,#64]
+ ldp x19,x20,[sp],#80
+ ret
+
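
The unrolled MD5 rounds above follow the RFC 1321 step function exactly as the comments annotate: each step adds the message word, the round constant and the auxiliary function of the other three state words, rotates left by s (encoded as a ror by 32-s), and finally adds B. A minimal C sketch of that step and the four auxiliary functions; the names ROTL32, F/G/H/I and STEP are illustrative and are not symbols from this file.

/*
 * Minimal sketch of the per-step operation computed by the unrolled
 * rounds above: a = b + ROTL(a + aux(b,c,d) + M[k] + T, s).
 */
#include <stdint.h>

#define ROTL32(x, s)  (((x) << (s)) | ((x) >> (32 - (s))))

#define F(x, y, z)  (((x) & (y)) | (~(x) & (z)))   /* round 1                    */
#define G(x, y, z)  (((x) & (z)) | ((y) & ~(z)))   /* round 2: (x&z) + (~z&y)    */
#define H(x, y, z)  ((x) ^ (y) ^ (z))              /* round 3: x^y^z             */
#define I(x, y, z)  ((y) ^ ((x) | ~(z)))           /* round 4: (~z|x)^y          */

#define STEP(aux, a, b, c, d, m, t, s)                               \
    do {                                                             \
        (a) += aux((b), (c), (d)) + (m) + (t);                       \
        (a)  = (b) + ROTL32((a), (s));                               \
    } while (0)

/* e.g. the first step of round 4 above: STEP(I, A, B, C, D, M[0], 0xf4292244, 6) */
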
diff --git a/sys/crypto/openssl/aarch64/poly1305-armv8.S b/sys/crypto/openssl/aarch64/poly1305-armv8.S
index 8925984c3ee0..3e0ccf7ff0d2 100644
--- a/sys/crypto/openssl/aarch64/poly1305-armv8.S
+++ b/sys/crypto/openssl/aarch64/poly1305-armv8.S
@@ -41,10 +41,14 @@ poly1305_init:
tst w17,#ARMV7_NEON
- adr x12,.Lpoly1305_blocks
- adr x7,.Lpoly1305_blocks_neon
- adr x13,.Lpoly1305_emit
- adr x8,.Lpoly1305_emit_neon
+ adrp x12,poly1305_blocks
+ add x12,x12,#:lo12:.Lpoly1305_blocks
+ adrp x7,poly1305_blocks_neon
+ add x7,x7,#:lo12:.Lpoly1305_blocks_neon
+ adrp x13,poly1305_emit
+ add x13,x13,#:lo12:.Lpoly1305_emit
+ adrp x8,poly1305_emit_neon
+ add x8,x8,#:lo12:.Lpoly1305_emit_neon
csel x12,x12,x7,eq
csel x13,x13,x8,eq
@@ -374,7 +378,8 @@ poly1305_blocks_neon:
ldr x30,[sp,#8]
add x16,x1,#32
- adr x17,.Lzeros
+ adrp x17,.Lzeros
+ add x17,x17,#:lo12:.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo
@@ -386,7 +391,8 @@ poly1305_blocks_neon:
.align 4
.Leven_neon:
add x16,x1,#32
- adr x17,.Lzeros
+ adrp x17,.Lzeros
+ add x17,x17,#:lo12:.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo
@@ -869,6 +875,8 @@ poly1305_emit_neon:
ret
.size poly1305_emit_neon,.-poly1305_emit_neon
+.section .rodata
+
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
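
The change in this hunk, repeated in the SHA-1, SHA-256 and SHA-512 hunks below, is mechanical: the constant tables move into .section .rodata, so the single-instruction adr (PC-relative, roughly +/-1 MiB) can no longer be relied on to reach them once the linker places .rodata away from .text, and it is replaced by an adrp/add #:lo12: pair that first addresses the symbol's 4 KiB page and then adds the low 12 bits of its offset. A minimal C sketch of the address arithmetic that pair performs; it is purely illustrative and not code from this tree.

#include <stdint.h>

/*
 * adrp materializes the 4 KiB page base of the target (reach is about
 * +/- 4 GiB from the PC's page); the add #:lo12:sym supplies the symbol's
 * offset within that page.  Illustrative only.
 */
static inline uint64_t adrp_add_lo12(uint64_t sym)
{
    uint64_t page = sym & ~(uint64_t)0xfff;   /* result of adrp       */
    uint64_t lo12 = sym &  (uint64_t)0xfff;   /* value of #:lo12:sym  */
    return page + lo12;                       /* == sym               */
}
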
diff --git a/sys/crypto/openssl/aarch64/sha1-armv8.S b/sys/crypto/openssl/aarch64/sha1-armv8.S
index 9e2d86072394..31627ee375a9 100644
--- a/sys/crypto/openssl/aarch64/sha1-armv8.S
+++ b/sys/crypto/openssl/aarch64/sha1-armv8.S
@@ -1081,7 +1081,8 @@ sha1_block_armv8:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
- adr x4,.Lconst
+ adrp x4,.Lconst
+ add x4,x4,#:lo12:.Lconst
eor v1.16b,v1.16b,v1.16b
ld1 {v0.4s},[x0],#16
ld1 {v1.s}[0],[x0]
@@ -1204,6 +1205,9 @@ sha1_block_armv8:
ldr x29,[sp],#16
ret
.size sha1_block_armv8,.-sha1_block_armv8
+
+.section .rodata
+
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
diff --git a/sys/crypto/openssl/aarch64/sha256-armv8.S b/sys/crypto/openssl/aarch64/sha256-armv8.S
index 4f3934a4890c..0b26b4d0e92a 100644
--- a/sys/crypto/openssl/aarch64/sha256-armv8.S
+++ b/sys/crypto/openssl/aarch64/sha256-armv8.S
@@ -1,5 +1,5 @@
/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+// Copyright 2014-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
@@ -93,7 +93,8 @@ sha256_block_data_order:
ldp w24,w25,[x0,#4*4]
add x2,x1,x2,lsl#6 // end of input
ldp w26,w27,[x0,#6*4]
- adr x30,.LK256
+ adrp x30,.LK256
+ add x30,x30,#:lo12:.LK256
stp x0,x2,[x29,#96]
.Loop:
@@ -1041,6 +1042,8 @@ sha256_block_data_order:
ret
.size sha256_block_data_order,.-sha256_block_data_order
+.section .rodata
+
.align 6
.type .LK256,%object
.LK256:
@@ -1065,6 +1068,8 @@ sha256_block_data_order:
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
+
+.text
#ifndef __KERNEL__
.type sha256_block_armv8,%function
.align 6
@@ -1075,7 +1080,8 @@ sha256_block_armv8:
add x29,sp,#0
ld1 {v0.4s,v1.4s},[x0]
- adr x3,.LK256
+ adrp x3,.LK256
+ add x3,x3,#:lo12:.LK256
.Loop_hw:
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
@@ -1219,7 +1225,8 @@ sha256_block_neon:
mov x29, sp
sub sp,sp,#16*4
- adr x16,.LK256
+ adrp x16,.LK256
+ add x16,x16,#:lo12:.LK256
add x2,x1,x2,lsl#6 // len to point at the end of inp
ld1 {v0.16b},[x1], #16
diff --git a/sys/crypto/openssl/aarch64/sha512-armv8.S b/sys/crypto/openssl/aarch64/sha512-armv8.S
index c119d9cf5c95..d88d310020dc 100644
--- a/sys/crypto/openssl/aarch64/sha512-armv8.S
+++ b/sys/crypto/openssl/aarch64/sha512-armv8.S
@@ -1,5 +1,5 @@
/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+// Copyright 2014-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
@@ -91,7 +91,8 @@ sha512_block_data_order:
ldp x24,x25,[x0,#4*8]
add x2,x1,x2,lsl#7 // end of input
ldp x26,x27,[x0,#6*8]
- adr x30,.LK512
+ adrp x30,.LK512
+ add x30,x30,#:lo12:.LK512
stp x0,x2,[x29,#96]
.Loop:
@@ -1039,6 +1040,8 @@ sha512_block_data_order:
ret
.size sha512_block_data_order,.-sha512_block_data_order
+.section .rodata
+
.align 6
.type .LK512,%object
.LK512:
@@ -1087,6 +1090,8 @@ sha512_block_data_order:
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
+
+.text
#ifndef __KERNEL__
.type sha512_block_armv8,%function
.align 6
@@ -1100,7 +1105,8 @@ sha512_block_armv8:
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
- adr x3,.LK512
+ adrp x3,.LK512
+ add x3,x3,#:lo12:.LK512
rev64 v16.16b,v16.16b
rev64 v17.16b,v17.16b
diff --git a/sys/crypto/openssl/aarch64/sm3-armv8.S b/sys/crypto/openssl/aarch64/sm3-armv8.S
new file mode 100644
index 000000000000..08785cae9e16
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/sm3-armv8.S
@@ -0,0 +1,509 @@
+/* Do not modify. This file is auto-generated from sm3-armv8.pl. */
+// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// This module implements support for Armv8 SM3 instructions
+
+// $output is the last argument if it looks like a file (it has an extension)
+// $flavour is the first argument if it doesn't look like a file
+#include "arm_arch.h"
+.text
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,%function
+.align 5
+ossl_hwsm3_block_data_order:
+ AARCH64_VALID_CALL_TARGET
+ // load state
+ ld1 {v5.4s,v6.4s}, [x0]
+ rev64 v5.4s, v5.4s
+ rev64 v6.4s, v6.4s
+ ext v5.16b, v5.16b, v5.16b, #8
+ ext v6.16b, v6.16b, v6.16b, #8
+ adrp x8, .Tj
+ add x8, x8, #:lo12:.Tj
+ ldp s16, s17, [x8]
+
+.Loop:
+ // load input
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x1], #64
+ sub w2, w2, #1
+
+ mov v18.16b, v5.16b
+ mov v19.16b, v6.16b
+
+#ifndef __AARCH64EB__
+ rev32 v0.16b, v0.16b
+ rev32 v1.16b, v1.16b
+ rev32 v2.16b, v2.16b
+ rev32 v3.16b, v3.16b
+#endif
+
+ ext v20.16b, v16.16b, v16.16b, #4
+ // s4 = w7 | w8 | w9 | w10
+ ext v4.16b, v1.16b, v2.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v0.16b, v1.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v2.16b, v3.16b, #8
+.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s
+.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s
+ eor v22.16b, v0.16b, v1.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0]
+.inst 0xce408ae6 //sm3tt2a v6.4s, v23.4s, v0.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1]
+.inst 0xce409ae6 //sm3tt2a v6.4s, v23.4s, v0.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2]
+.inst 0xce40aae6 //sm3tt2a v6.4s, v23.4s, v0.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3]
+.inst 0xce40bae6 //sm3tt2a v6.4s, v23.4s, v0.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v0.16b, v2.16b, v3.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v1.16b, v2.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v3.16b, v4.16b, #8
+.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s
+.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s
+ eor v22.16b, v1.16b, v2.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0]
+.inst 0xce418ae6 //sm3tt2a v6.4s, v23.4s, v1.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1]
+.inst 0xce419ae6 //sm3tt2a v6.4s, v23.4s, v1.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2]
+.inst 0xce41aae6 //sm3tt2a v6.4s, v23.4s, v1.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3]
+.inst 0xce41bae6 //sm3tt2a v6.4s, v23.4s, v1.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v1.16b, v3.16b, v4.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v2.16b, v3.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v4.16b, v0.16b, #8
+.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s
+.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s
+ eor v22.16b, v2.16b, v3.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0]
+.inst 0xce428ae6 //sm3tt2a v6.4s, v23.4s, v2.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1]
+.inst 0xce429ae6 //sm3tt2a v6.4s, v23.4s, v2.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2]
+.inst 0xce42aae6 //sm3tt2a v6.4s, v23.4s, v2.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3]
+.inst 0xce42bae6 //sm3tt2a v6.4s, v23.4s, v2.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v2.16b, v4.16b, v0.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v3.16b, v4.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v0.16b, v1.16b, #8
+.inst 0xce61c062 //sm3partw1 v2.4s, v3.4s, v1.4s
+.inst 0xce76c6e2 //sm3partw2 v2.4s, v23.4s, v22.4s
+ eor v22.16b, v3.16b, v4.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0]
+.inst 0xce438ae6 //sm3tt2a v6.4s, v23.4s, v3.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1]
+.inst 0xce439ae6 //sm3tt2a v6.4s, v23.4s, v3.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2]
+.inst 0xce43aae6 //sm3tt2a v6.4s, v23.4s, v3.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3]
+.inst 0xce43bae6 //sm3tt2a v6.4s, v23.4s, v3.4s[3]
+ ext v20.16b, v17.16b, v17.16b, #4
+ // s4 = w7 | w8 | w9 | w10
+ ext v3.16b, v0.16b, v1.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v4.16b, v0.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v1.16b, v2.16b, #8
+.inst 0xce62c083 //sm3partw1 v3.4s, v4.4s, v2.4s
+.inst 0xce76c6e3 //sm3partw2 v3.4s, v23.4s, v22.4s
+ eor v22.16b, v4.16b, v0.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v4.16b, v1.16b, v2.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v0.16b, v1.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v2.16b, v3.16b, #8
+.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s
+.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s
+ eor v22.16b, v0.16b, v1.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v0.16b, v2.16b, v3.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v1.16b, v2.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v3.16b, v4.16b, #8
+.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s
+.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s
+ eor v22.16b, v1.16b, v2.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce418ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce419ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce41aee6 //sm3tt2b v6.4s, v23.4s, v1.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce41bee6 //sm3tt2b v6.4s, v23.4s, v1.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v1.16b, v3.16b, v4.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v2.16b, v3.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v4.16b, v0.16b, #8
+.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s
+.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s
+ eor v22.16b, v2.16b, v3.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce428ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce429ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce42aee6 //sm3tt2b v6.4s, v23.4s, v2.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce42bee6 //sm3tt2b v6.4s, v23.4s, v2.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v2.16b, v4.16b, v0.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v3.16b, v4.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v0.16b, v1.16b, #8
+.inst 0xce61c062 //sm3partw1 v2.4s, v3.4s, v1.4s
+.inst 0xce76c6e2 //sm3partw2 v2.4s, v23.4s, v22.4s
+ eor v22.16b, v3.16b, v4.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce438ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce439ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce43aee6 //sm3tt2b v6.4s, v23.4s, v3.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce43bee6 //sm3tt2b v6.4s, v23.4s, v3.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v3.16b, v0.16b, v1.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v4.16b, v0.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v1.16b, v2.16b, #8
+.inst 0xce62c083 //sm3partw1 v3.4s, v4.4s, v2.4s
+.inst 0xce76c6e3 //sm3partw2 v3.4s, v23.4s, v22.4s
+ eor v22.16b, v4.16b, v0.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v4.16b, v1.16b, v2.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v0.16b, v1.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v2.16b, v3.16b, #8
+.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s
+.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s
+ eor v22.16b, v0.16b, v1.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v0.16b, v2.16b, v3.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v1.16b, v2.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v3.16b, v4.16b, #8
+.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s
+.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s
+ eor v22.16b, v1.16b, v2.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce418ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce419ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce41aee6 //sm3tt2b v6.4s, v23.4s, v1.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce41bee6 //sm3tt2b v6.4s, v23.4s, v1.4s[3]
+ // s4 = w7 | w8 | w9 | w10
+ ext v1.16b, v3.16b, v4.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext v22.16b, v2.16b, v3.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext v23.16b, v4.16b, v0.16b, #8
+.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s
+.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s
+ eor v22.16b, v2.16b, v3.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce428ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce429ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce42aee6 //sm3tt2b v6.4s, v23.4s, v2.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce42bee6 //sm3tt2b v6.4s, v23.4s, v2.4s[3]
+ eor v22.16b, v3.16b, v4.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce438ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce439ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce43aee6 //sm3tt2b v6.4s, v23.4s, v3.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce43bee6 //sm3tt2b v6.4s, v23.4s, v3.4s[3]
+ eor v22.16b, v4.16b, v0.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3]
+ eor v22.16b, v0.16b, v1.16b
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0]
+.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1]
+.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1]
+.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
+ shl v21.4s, v20.4s, #1
+ sri v21.4s, v20.4s, #31
+.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2]
+.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2]
+.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
+ shl v20.4s, v21.4s, #1
+ sri v20.4s, v21.4s, #31
+.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3]
+.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3]
+ eor v5.16b, v5.16b, v18.16b
+ eor v6.16b, v6.16b, v19.16b
+
+ // any remained blocks?
+ cbnz w2, .Loop
+
+ // save state
+ rev64 v5.4s, v5.4s
+ rev64 v6.4s, v6.4s
+ ext v5.16b, v5.16b, v5.16b, #8
+ ext v6.16b, v6.16b, v6.16b, #8
+ st1 {v5.4s,v6.4s}, [x0]
+ ret
+.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
+.section .rodata
+
+.type _sm3_consts,%object
+.align 3
+_sm3_consts:
+.Tj:
+.word 0x79cc4519, 0x9d8a7a87
+.size _sm3_consts,.-_sm3_consts
+.previous
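
ossl_hwsm3_block_data_order above consumes whole 64-byte blocks: x0 points at the eight 32-bit words of SM3 state, x1 at the input, and w2 is the block count. A hedged C sketch of how a caller could drive it; the prototype is inferred from that register usage rather than quoted from a header, and the state struct name is illustrative.

#include <stddef.h>
#include <stdint.h>

typedef struct { uint32_t h[8]; } sm3_state_t;          /* illustrative */

/* Prototype inferred from the assembly: x0 = state, x1 = data, w2 = blocks. */
void ossl_hwsm3_block_data_order(void *state, const void *data, size_t num);

static void sm3_update_blocks(sm3_state_t *st, const uint8_t *p, size_t len)
{
    size_t blocks = len / 64;                /* whole 64-byte blocks only   */
    if (blocks)
        ossl_hwsm3_block_data_order(st->h, p, blocks);
    /* any tail shorter than 64 bytes must be buffered by the caller */
}
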
diff --git a/sys/crypto/openssl/aarch64/sm4-armv8.S b/sys/crypto/openssl/aarch64/sm4-armv8.S
new file mode 100644
index 000000000000..4d3aa3cd70b3
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/sm4-armv8.S
@@ -0,0 +1,1093 @@
+/* Do not modify. This file is auto-generated from sm4-armv8.pl. */
+// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+//
+// This module implements support for SM4 hw support on aarch64
+// Oct 2021
+//
+
+// $output is the last argument if it looks like a file (it has an extension)
+// $flavour is the first argument if it doesn't look like a file
+#include "arm_arch.h"
+.arch armv8-a+crypto
+.text
+
+.section .rodata
+.type _sm4_v8_consts,%object
+.align 6
+_sm4_v8_consts:
+.Lck:
+.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
+.size _sm4_v8_consts,.-_sm4_v8_consts
+.previous
+
+.globl sm4_v8_set_encrypt_key
+.type sm4_v8_set_encrypt_key,%function
+.align 5
+sm4_v8_set_encrypt_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v0.4s},[x0]
+ adrp x2,.Lfk
+ add x2,x2,#:lo12:.Lfk
+ ld1 {v24.4s},[x2]
+ adrp x2,.Lck
+ add x2,x2,#:lo12:.Lck
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v0.16b
+#endif
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2]
+ eor v0.16b,v0.16b,v24.16b;
+.inst 0xce70c800 //sm4ekey v0.4S,v0.4S,v16.4S
+.inst 0xce71c801 //sm4ekey v1.4S,v0.4S,v17.4S
+.inst 0xce72c822 //sm4ekey v2.4S,v1.4S,v18.4S
+.inst 0xce73c843 //sm4ekey v3.4S,v2.4S,v19.4S
+.inst 0xce74c864 //sm4ekey v4.4S,v3.4S,v20.4S
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64
+.inst 0xce75c885 //sm4ekey v5.4S,v4.4S,v21.4S
+.inst 0xce76c8a6 //sm4ekey v6.4S,v5.4S,v22.4S
+.inst 0xce77c8c7 //sm4ekey v7.4S,v6.4S,v23.4S
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1]
+ ret
+.size sm4_v8_set_encrypt_key,.-sm4_v8_set_encrypt_key
+.globl sm4_v8_set_decrypt_key
+.type sm4_v8_set_decrypt_key,%function
+.align 5
+sm4_v8_set_decrypt_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v7.4s},[x0]
+ adrp x2,.Lfk
+ add x2,x2,#:lo12:.Lfk
+ ld1 {v24.4s},[x2]
+ adrp x2,.Lck
+ add x2,x2,#:lo12:.Lck
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2]
+ eor v7.16b, v7.16b,v24.16b;
+.inst 0xce70c8e7 //sm4ekey v7.4S,v7.4S,v16.4S
+.inst 0xce71c8e6 //sm4ekey v6.4S,v7.4S,v17.4S
+.inst 0xce72c8c5 //sm4ekey v5.4S,v6.4S,v18.4S
+ rev64 v7.4s,v7.4s
+ rev64 v6.4s,v6.4s
+ ext v7.16b,v7.16b,v7.16b,#8
+ ext v6.16b,v6.16b,v6.16b,#8
+.inst 0xce73c8a4 //sm4ekey v4.4S,v5.4S,v19.4S
+.inst 0xce74c883 //sm4ekey v3.4S,v4.4S,v20.4S
+ rev64 v5.4s,v5.4s
+ rev64 v4.4s,v4.4s
+ ext v5.16b,v5.16b,v5.16b,#8
+ ext v4.16b,v4.16b,v4.16b,#8
+.inst 0xce75c862 //sm4ekey v2.4S,v3.4S,v21.4S
+.inst 0xce76c841 //sm4ekey v1.4S,v2.4S,v22.4S
+ rev64 v3.4s,v3.4s
+ rev64 v2.4s,v2.4s
+ ext v3.16b,v3.16b,v3.16b,#8
+ ext v2.16b,v2.16b,v2.16b,#8
+.inst 0xce77c820 //sm4ekey v0.4S,v1.4S,v23.4S
+ rev64 v1.4s, v1.4s
+ rev64 v0.4s, v0.4s
+ ext v1.16b,v1.16b,v1.16b,#8
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1]
+ ret
+.size sm4_v8_set_decrypt_key,.-sm4_v8_set_decrypt_key
+.globl sm4_v8_encrypt
+.type sm4_v8_encrypt,%function
+.align 5
+sm4_v8_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v16.4s},[x0]
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x2],64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x2]
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+ ext v16.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ st1 {v16.4s},[x1]
+ ret
+.size sm4_v8_encrypt,.-sm4_v8_encrypt
+.globl sm4_v8_decrypt
+.type sm4_v8_decrypt,%function
+.align 5
+sm4_v8_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v16.4s},[x0]
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x2],64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x2]
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+ ext v16.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ st1 {v16.4s},[x1]
+ ret
+.size sm4_v8_decrypt,.-sm4_v8_decrypt
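
sm4_v8_set_encrypt_key and sm4_v8_set_decrypt_key above expand the 128-bit user key into 32 round keys; the decrypt variant produces the same keys but stores them in reverse order (hence the rev64/ext shuffles and the swapped store order), and sm4_v8_encrypt/sm4_v8_decrypt then transform a single 16-byte block with the given schedule. A hedged usage sketch: the prototypes are inferred from the register usage (set key: x0 = user key, x1 = round keys; encrypt/decrypt: x0 = in, x1 = out, x2 = round keys), and the key-struct name is illustrative.

#include <stdint.h>

typedef struct { uint32_t rk[32]; } sm4_key_t;          /* illustrative */

void sm4_v8_set_encrypt_key(const unsigned char *userKey, sm4_key_t *key);
void sm4_v8_set_decrypt_key(const unsigned char *userKey, sm4_key_t *key);
void sm4_v8_encrypt(const unsigned char *in, unsigned char *out, const sm4_key_t *key);
void sm4_v8_decrypt(const unsigned char *in, unsigned char *out, const sm4_key_t *key);

static void sm4_one_block_roundtrip(const unsigned char key[16],
                                    const unsigned char pt[16],
                                    unsigned char ct[16],
                                    unsigned char back[16])
{
    sm4_key_t enc, dec;

    sm4_v8_set_encrypt_key(key, &enc);    /* forward-order round keys   */
    sm4_v8_set_decrypt_key(key, &dec);    /* same keys, reversed order  */
    sm4_v8_encrypt(pt, ct, &enc);
    sm4_v8_decrypt(ct, back, &dec);       /* back[] ends up equal to pt[] */
}
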
+.globl sm4_v8_ecb_encrypt
+.type sm4_v8_ecb_encrypt,%function
+.align 5
+sm4_v8_ecb_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3]
+1:
+ cmp x2,#64
+ b.lt 1f
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64
+ cmp x2,#128
+ b.lt 2f
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64
+ // 8 blocks
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08411 //sm4e v17.4s,v0.4s
+.inst 0xcec08412 //sm4e v18.4s,v0.4s
+.inst 0xcec08413 //sm4e v19.4s,v0.4s
+
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08431 //sm4e v17.4s,v1.4s
+.inst 0xcec08432 //sm4e v18.4s,v1.4s
+.inst 0xcec08433 //sm4e v19.4s,v1.4s
+
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08451 //sm4e v17.4s,v2.4s
+.inst 0xcec08452 //sm4e v18.4s,v2.4s
+.inst 0xcec08453 //sm4e v19.4s,v2.4s
+
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08471 //sm4e v17.4s,v3.4s
+.inst 0xcec08472 //sm4e v18.4s,v3.4s
+.inst 0xcec08473 //sm4e v19.4s,v3.4s
+
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec08491 //sm4e v17.4s,v4.4s
+.inst 0xcec08492 //sm4e v18.4s,v4.4s
+.inst 0xcec08493 //sm4e v19.4s,v4.4s
+
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084b1 //sm4e v17.4s,v5.4s
+.inst 0xcec084b2 //sm4e v18.4s,v5.4s
+.inst 0xcec084b3 //sm4e v19.4s,v5.4s
+
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084d1 //sm4e v17.4s,v6.4s
+.inst 0xcec084d2 //sm4e v18.4s,v6.4s
+.inst 0xcec084d3 //sm4e v19.4s,v6.4s
+
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+.inst 0xcec084f1 //sm4e v17.4s,v7.4s
+ ext v16.16b,v16.16b,v16.16b,#8
+ rev64 v17.4S,v17.4S
+.inst 0xcec084f2 //sm4e v18.4s,v7.4s
+ ext v17.16b,v17.16b,v17.16b,#8
+ rev64 v18.4S,v18.4S
+.inst 0xcec084f3 //sm4e v19.4s,v7.4s
+ ext v18.16b,v18.16b,v18.16b,#8
+ rev64 v19.4S,v19.4S
+ ext v19.16b,v19.16b,v19.16b,#8
+.inst 0xcec08414 //sm4e v20.4s,v0.4s
+.inst 0xcec08415 //sm4e v21.4s,v0.4s
+.inst 0xcec08416 //sm4e v22.4s,v0.4s
+.inst 0xcec08417 //sm4e v23.4s,v0.4s
+
+.inst 0xcec08434 //sm4e v20.4s,v1.4s
+.inst 0xcec08435 //sm4e v21.4s,v1.4s
+.inst 0xcec08436 //sm4e v22.4s,v1.4s
+.inst 0xcec08437 //sm4e v23.4s,v1.4s
+
+.inst 0xcec08454 //sm4e v20.4s,v2.4s
+.inst 0xcec08455 //sm4e v21.4s,v2.4s
+.inst 0xcec08456 //sm4e v22.4s,v2.4s
+.inst 0xcec08457 //sm4e v23.4s,v2.4s
+
+.inst 0xcec08474 //sm4e v20.4s,v3.4s
+.inst 0xcec08475 //sm4e v21.4s,v3.4s
+.inst 0xcec08476 //sm4e v22.4s,v3.4s
+.inst 0xcec08477 //sm4e v23.4s,v3.4s
+
+.inst 0xcec08494 //sm4e v20.4s,v4.4s
+.inst 0xcec08495 //sm4e v21.4s,v4.4s
+.inst 0xcec08496 //sm4e v22.4s,v4.4s
+.inst 0xcec08497 //sm4e v23.4s,v4.4s
+
+.inst 0xcec084b4 //sm4e v20.4s,v5.4s
+.inst 0xcec084b5 //sm4e v21.4s,v5.4s
+.inst 0xcec084b6 //sm4e v22.4s,v5.4s
+.inst 0xcec084b7 //sm4e v23.4s,v5.4s
+
+.inst 0xcec084d4 //sm4e v20.4s,v6.4s
+.inst 0xcec084d5 //sm4e v21.4s,v6.4s
+.inst 0xcec084d6 //sm4e v22.4s,v6.4s
+.inst 0xcec084d7 //sm4e v23.4s,v6.4s
+
+.inst 0xcec084f4 //sm4e v20.4s,v7.4s
+ rev64 v20.4S,v20.4S
+.inst 0xcec084f5 //sm4e v21.4s,v7.4s
+ ext v20.16b,v20.16b,v20.16b,#8
+ rev64 v21.4S,v21.4S
+.inst 0xcec084f6 //sm4e v22.4s,v7.4s
+ ext v21.16b,v21.16b,v21.16b,#8
+ rev64 v22.4S,v22.4S
+.inst 0xcec084f7 //sm4e v23.4s,v7.4s
+ ext v22.16b,v22.16b,v22.16b,#8
+ rev64 v23.4S,v23.4S
+ ext v23.16b,v23.16b,v23.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64
+#ifndef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+ st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64
+ subs x2,x2,#128
+ b.gt 1b
+ ret
+ // 4 blocks
+2:
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08411 //sm4e v17.4s,v0.4s
+.inst 0xcec08412 //sm4e v18.4s,v0.4s
+.inst 0xcec08413 //sm4e v19.4s,v0.4s
+
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08431 //sm4e v17.4s,v1.4s
+.inst 0xcec08432 //sm4e v18.4s,v1.4s
+.inst 0xcec08433 //sm4e v19.4s,v1.4s
+
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08451 //sm4e v17.4s,v2.4s
+.inst 0xcec08452 //sm4e v18.4s,v2.4s
+.inst 0xcec08453 //sm4e v19.4s,v2.4s
+
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08471 //sm4e v17.4s,v3.4s
+.inst 0xcec08472 //sm4e v18.4s,v3.4s
+.inst 0xcec08473 //sm4e v19.4s,v3.4s
+
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec08491 //sm4e v17.4s,v4.4s
+.inst 0xcec08492 //sm4e v18.4s,v4.4s
+.inst 0xcec08493 //sm4e v19.4s,v4.4s
+
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084b1 //sm4e v17.4s,v5.4s
+.inst 0xcec084b2 //sm4e v18.4s,v5.4s
+.inst 0xcec084b3 //sm4e v19.4s,v5.4s
+
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084d1 //sm4e v17.4s,v6.4s
+.inst 0xcec084d2 //sm4e v18.4s,v6.4s
+.inst 0xcec084d3 //sm4e v19.4s,v6.4s
+
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+.inst 0xcec084f1 //sm4e v17.4s,v7.4s
+ ext v16.16b,v16.16b,v16.16b,#8
+ rev64 v17.4S,v17.4S
+.inst 0xcec084f2 //sm4e v18.4s,v7.4s
+ ext v17.16b,v17.16b,v17.16b,#8
+ rev64 v18.4S,v18.4S
+.inst 0xcec084f3 //sm4e v19.4s,v7.4s
+ ext v18.16b,v18.16b,v18.16b,#8
+ rev64 v19.4S,v19.4S
+ ext v19.16b,v19.16b,v19.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64
+ subs x2,x2,#64
+ b.gt 1b
+1:
+ subs x2,x2,#16
+ b.lt 1f
+ ld1 {v16.4s},[x0],#16
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+ ext v16.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ st1 {v16.4s},[x1],#16
+ b.ne 1b
+1:
+ ret
+.size sm4_v8_ecb_encrypt,.-sm4_v8_ecb_encrypt
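
sm4_v8_ecb_encrypt above consumes its byte count in the largest chunks available: eight blocks (128 bytes) per loop iteration while enough input remains, one four-block pass for a 64-byte remainder, then single 16-byte blocks. A hedged C sketch of that dispatch; the byte interpretation of x2 is inferred from the cmp x2,#64 / subs x2,x2,#128 sequence, and process_blocks is an illustrative stand-in for the SIMD paths, not a symbol from this file.

#include <stddef.h>

static void process_blocks(const unsigned char *in, unsigned char *out,
                           size_t nblocks)
{
    /* stand-in for the 8-/4-/1-block sm4e paths above */
    (void)in; (void)out; (void)nblocks;
}

static void ecb_dispatch(const unsigned char *in, unsigned char *out, size_t len)
{
    while (len >= 128) {                   /* 8-block SIMD path   */
        process_blocks(in, out, 8);
        in += 128; out += 128; len -= 128;
    }
    if (len >= 64) {                       /* 4-block SIMD path   */
        process_blocks(in, out, 4);
        in += 64; out += 64; len -= 64;
    }
    while (len >= 16) {                    /* single-block tail   */
        process_blocks(in, out, 1);
        in += 16; out += 16; len -= 16;
    }
}
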
+.globl sm4_v8_cbc_encrypt
+.type sm4_v8_cbc_encrypt,%function
+.align 5
+sm4_v8_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3]
+ ld1 {v8.4s},[x4]
+ cmp w5,#0
+ b.eq .Ldec
+1:
+ cmp x2, #64
+ b.lt 1f
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64
+ eor v16.16b,v16.16b,v8.16b
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+ ext v16.16b,v16.16b,v16.16b,#8
+ eor v17.16b,v17.16b,v16.16b
+.inst 0xcec08411 //sm4e v17.4s,v0.4s
+.inst 0xcec08431 //sm4e v17.4s,v1.4s
+.inst 0xcec08451 //sm4e v17.4s,v2.4s
+.inst 0xcec08471 //sm4e v17.4s,v3.4s
+.inst 0xcec08491 //sm4e v17.4s,v4.4s
+.inst 0xcec084b1 //sm4e v17.4s,v5.4s
+.inst 0xcec084d1 //sm4e v17.4s,v6.4s
+.inst 0xcec084f1 //sm4e v17.4s,v7.4s
+ rev64 v17.4S,v17.4S
+ ext v17.16b,v17.16b,v17.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ eor v18.16b,v18.16b,v17.16b
+.inst 0xcec08412 //sm4e v18.4s,v0.4s
+.inst 0xcec08432 //sm4e v18.4s,v1.4s
+.inst 0xcec08452 //sm4e v18.4s,v2.4s
+.inst 0xcec08472 //sm4e v18.4s,v3.4s
+.inst 0xcec08492 //sm4e v18.4s,v4.4s
+.inst 0xcec084b2 //sm4e v18.4s,v5.4s
+.inst 0xcec084d2 //sm4e v18.4s,v6.4s
+.inst 0xcec084f2 //sm4e v18.4s,v7.4s
+ rev64 v18.4S,v18.4S
+ ext v18.16b,v18.16b,v18.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ eor v19.16b,v19.16b,v18.16b
+.inst 0xcec08413 //sm4e v19.4s,v0.4s
+.inst 0xcec08433 //sm4e v19.4s,v1.4s
+.inst 0xcec08453 //sm4e v19.4s,v2.4s
+.inst 0xcec08473 //sm4e v19.4s,v3.4s
+.inst 0xcec08493 //sm4e v19.4s,v4.4s
+.inst 0xcec084b3 //sm4e v19.4s,v5.4s
+.inst 0xcec084d3 //sm4e v19.4s,v6.4s
+.inst 0xcec084f3 //sm4e v19.4s,v7.4s
+ rev64 v19.4S,v19.4S
+ ext v19.16b,v19.16b,v19.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ mov v8.16b,v19.16b
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64
+ subs x2,x2,#64
+ b.ne 1b
+1:
+ subs x2,x2,#16
+ b.lt 3f
+ ld1 {v16.4s},[x0],#16
+ eor v8.16b,v8.16b,v16.16b
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+.inst 0xcec08408 //sm4e v8.4s,v0.4s
+.inst 0xcec08428 //sm4e v8.4s,v1.4s
+.inst 0xcec08448 //sm4e v8.4s,v2.4s
+.inst 0xcec08468 //sm4e v8.4s,v3.4s
+.inst 0xcec08488 //sm4e v8.4s,v4.4s
+.inst 0xcec084a8 //sm4e v8.4s,v5.4s
+.inst 0xcec084c8 //sm4e v8.4s,v6.4s
+.inst 0xcec084e8 //sm4e v8.4s,v7.4s
+ rev64 v8.4S,v8.4S
+ ext v8.16b,v8.16b,v8.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ st1 {v8.4s},[x1],#16
+ b.ne 1b
+ b 3f
+.Ldec:
+1:
+ cmp x2, #64
+ b.lt 1f
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0]
+ ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64
+ cmp x2,#128
+ b.lt 2f
+ // 8 blocks mode
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0]
+ ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08411 //sm4e v17.4s,v0.4s
+.inst 0xcec08412 //sm4e v18.4s,v0.4s
+.inst 0xcec08413 //sm4e v19.4s,v0.4s
+
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08431 //sm4e v17.4s,v1.4s
+.inst 0xcec08432 //sm4e v18.4s,v1.4s
+.inst 0xcec08433 //sm4e v19.4s,v1.4s
+
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08451 //sm4e v17.4s,v2.4s
+.inst 0xcec08452 //sm4e v18.4s,v2.4s
+.inst 0xcec08453 //sm4e v19.4s,v2.4s
+
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08471 //sm4e v17.4s,v3.4s
+.inst 0xcec08472 //sm4e v18.4s,v3.4s
+.inst 0xcec08473 //sm4e v19.4s,v3.4s
+
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec08491 //sm4e v17.4s,v4.4s
+.inst 0xcec08492 //sm4e v18.4s,v4.4s
+.inst 0xcec08493 //sm4e v19.4s,v4.4s
+
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084b1 //sm4e v17.4s,v5.4s
+.inst 0xcec084b2 //sm4e v18.4s,v5.4s
+.inst 0xcec084b3 //sm4e v19.4s,v5.4s
+
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084d1 //sm4e v17.4s,v6.4s
+.inst 0xcec084d2 //sm4e v18.4s,v6.4s
+.inst 0xcec084d3 //sm4e v19.4s,v6.4s
+
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+.inst 0xcec084f1 //sm4e v17.4s,v7.4s
+ ext v16.16b,v16.16b,v16.16b,#8
+ rev64 v17.4S,v17.4S
+.inst 0xcec084f2 //sm4e v18.4s,v7.4s
+ ext v17.16b,v17.16b,v17.16b,#8
+ rev64 v18.4S,v18.4S
+.inst 0xcec084f3 //sm4e v19.4s,v7.4s
+ ext v18.16b,v18.16b,v18.16b,#8
+ rev64 v19.4S,v19.4S
+ ext v19.16b,v19.16b,v19.16b,#8
+.inst 0xcec08414 //sm4e v20.4s,v0.4s
+.inst 0xcec08415 //sm4e v21.4s,v0.4s
+.inst 0xcec08416 //sm4e v22.4s,v0.4s
+.inst 0xcec08417 //sm4e v23.4s,v0.4s
+
+.inst 0xcec08434 //sm4e v20.4s,v1.4s
+.inst 0xcec08435 //sm4e v21.4s,v1.4s
+.inst 0xcec08436 //sm4e v22.4s,v1.4s
+.inst 0xcec08437 //sm4e v23.4s,v1.4s
+
+.inst 0xcec08454 //sm4e v20.4s,v2.4s
+.inst 0xcec08455 //sm4e v21.4s,v2.4s
+.inst 0xcec08456 //sm4e v22.4s,v2.4s
+.inst 0xcec08457 //sm4e v23.4s,v2.4s
+
+.inst 0xcec08474 //sm4e v20.4s,v3.4s
+.inst 0xcec08475 //sm4e v21.4s,v3.4s
+.inst 0xcec08476 //sm4e v22.4s,v3.4s
+.inst 0xcec08477 //sm4e v23.4s,v3.4s
+
+.inst 0xcec08494 //sm4e v20.4s,v4.4s
+.inst 0xcec08495 //sm4e v21.4s,v4.4s
+.inst 0xcec08496 //sm4e v22.4s,v4.4s
+.inst 0xcec08497 //sm4e v23.4s,v4.4s
+
+.inst 0xcec084b4 //sm4e v20.4s,v5.4s
+.inst 0xcec084b5 //sm4e v21.4s,v5.4s
+.inst 0xcec084b6 //sm4e v22.4s,v5.4s
+.inst 0xcec084b7 //sm4e v23.4s,v5.4s
+
+.inst 0xcec084d4 //sm4e v20.4s,v6.4s
+.inst 0xcec084d5 //sm4e v21.4s,v6.4s
+.inst 0xcec084d6 //sm4e v22.4s,v6.4s
+.inst 0xcec084d7 //sm4e v23.4s,v6.4s
+
+.inst 0xcec084f4 //sm4e v20.4s,v7.4s
+ rev64 v20.4S,v20.4S
+.inst 0xcec084f5 //sm4e v21.4s,v7.4s
+ ext v20.16b,v20.16b,v20.16b,#8
+ rev64 v21.4S,v21.4S
+.inst 0xcec084f6 //sm4e v22.4s,v7.4s
+ ext v21.16b,v21.16b,v21.16b,#8
+ rev64 v22.4S,v22.4S
+.inst 0xcec084f7 //sm4e v23.4s,v7.4s
+ ext v22.16b,v22.16b,v22.16b,#8
+ rev64 v23.4S,v23.4S
+ ext v23.16b,v23.16b,v23.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v24.16b
+ eor v18.16b,v18.16b,v25.16b
+ mov v8.16b,v31.16b
+ eor v19.16b,v19.16b,v26.16b
+ eor v20.16b,v20.16b,v27.16b
+ eor v21.16b,v21.16b,v28.16b
+ eor v22.16b,v22.16b,v29.16b
+ eor v23.16b,v23.16b,v30.16b
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64
+ st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64
+ subs x2,x2,#128
+ b.gt 1b
+ b 3f
+ // 4 blocks mode
+2:
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08411 //sm4e v17.4s,v0.4s
+.inst 0xcec08412 //sm4e v18.4s,v0.4s
+.inst 0xcec08413 //sm4e v19.4s,v0.4s
+
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08431 //sm4e v17.4s,v1.4s
+.inst 0xcec08432 //sm4e v18.4s,v1.4s
+.inst 0xcec08433 //sm4e v19.4s,v1.4s
+
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08451 //sm4e v17.4s,v2.4s
+.inst 0xcec08452 //sm4e v18.4s,v2.4s
+.inst 0xcec08453 //sm4e v19.4s,v2.4s
+
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08471 //sm4e v17.4s,v3.4s
+.inst 0xcec08472 //sm4e v18.4s,v3.4s
+.inst 0xcec08473 //sm4e v19.4s,v3.4s
+
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec08491 //sm4e v17.4s,v4.4s
+.inst 0xcec08492 //sm4e v18.4s,v4.4s
+.inst 0xcec08493 //sm4e v19.4s,v4.4s
+
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084b1 //sm4e v17.4s,v5.4s
+.inst 0xcec084b2 //sm4e v18.4s,v5.4s
+.inst 0xcec084b3 //sm4e v19.4s,v5.4s
+
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084d1 //sm4e v17.4s,v6.4s
+.inst 0xcec084d2 //sm4e v18.4s,v6.4s
+.inst 0xcec084d3 //sm4e v19.4s,v6.4s
+
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+.inst 0xcec084f1 //sm4e v17.4s,v7.4s
+ ext v16.16b,v16.16b,v16.16b,#8
+ rev64 v17.4S,v17.4S
+.inst 0xcec084f2 //sm4e v18.4s,v7.4s
+ ext v17.16b,v17.16b,v17.16b,#8
+ rev64 v18.4S,v18.4S
+.inst 0xcec084f3 //sm4e v19.4s,v7.4s
+ ext v18.16b,v18.16b,v18.16b,#8
+ rev64 v19.4S,v19.4S
+ ext v19.16b,v19.16b,v19.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v24.16b
+ mov v8.16b,v27.16b
+ eor v18.16b,v18.16b,v25.16b
+ eor v19.16b,v19.16b,v26.16b
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64
+ subs x2,x2,#64
+ b.gt 1b
+1:
+ subs x2,x2,#16
+ b.lt 3f
+ ld1 {v16.4s},[x0],#16
+ mov v24.16b,v16.16b
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+ ext v16.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ eor v16.16b,v16.16b,v8.16b
+ mov v8.16b,v24.16b
+ st1 {v16.4s},[x1],#16
+ b.ne 1b
+3:
+ // save back IV
+ st1 {v8.4s},[x4]
+ ldp d8,d9,[sp],#16
+ ret
+.size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt
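+// sm4_v8_ctr32_encrypt_blocks: CTR mode on top of the SM4 hardware
+// instructions (the sm4e .inst words).  The counter block lives in v8 with
+// the 32-bit counter word (v8.s[3]) mirrored in w5, and the code peels
+// 8-block and 4-block batches before finishing one block at a time.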
+.globl sm4_v8_ctr32_encrypt_blocks
+.type sm4_v8_ctr32_encrypt_blocks,%function
+.align 5
+sm4_v8_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {v8.4s},[x4]
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov w5,v8.s[3]
+1:
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64
+ mov v16.16b,v8.16b
+ mov v17.16b,v8.16b
+ mov v18.16b,v8.16b
+ mov v19.16b,v8.16b
+ add w5,w5,#1
+ mov v17.s[3],w5
+ add w5,w5,#1
+ mov v18.s[3],w5
+ add w5,w5,#1
+ mov v19.s[3],w5
+ cmp x2,#8
+ b.lt 2f
+ ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64
+ mov v20.16b,v8.16b
+ mov v21.16b,v8.16b
+ mov v22.16b,v8.16b
+ mov v23.16b,v8.16b
+ add w5,w5,#1
+ mov v20.s[3],w5
+ add w5,w5,#1
+ mov v21.s[3],w5
+ add w5,w5,#1
+ mov v22.s[3],w5
+ add w5,w5,#1
+ mov v23.s[3],w5
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08411 //sm4e v17.4s,v0.4s
+.inst 0xcec08412 //sm4e v18.4s,v0.4s
+.inst 0xcec08413 //sm4e v19.4s,v0.4s
+
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08431 //sm4e v17.4s,v1.4s
+.inst 0xcec08432 //sm4e v18.4s,v1.4s
+.inst 0xcec08433 //sm4e v19.4s,v1.4s
+
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08451 //sm4e v17.4s,v2.4s
+.inst 0xcec08452 //sm4e v18.4s,v2.4s
+.inst 0xcec08453 //sm4e v19.4s,v2.4s
+
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08471 //sm4e v17.4s,v3.4s
+.inst 0xcec08472 //sm4e v18.4s,v3.4s
+.inst 0xcec08473 //sm4e v19.4s,v3.4s
+
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec08491 //sm4e v17.4s,v4.4s
+.inst 0xcec08492 //sm4e v18.4s,v4.4s
+.inst 0xcec08493 //sm4e v19.4s,v4.4s
+
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084b1 //sm4e v17.4s,v5.4s
+.inst 0xcec084b2 //sm4e v18.4s,v5.4s
+.inst 0xcec084b3 //sm4e v19.4s,v5.4s
+
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084d1 //sm4e v17.4s,v6.4s
+.inst 0xcec084d2 //sm4e v18.4s,v6.4s
+.inst 0xcec084d3 //sm4e v19.4s,v6.4s
+
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+.inst 0xcec084f1 //sm4e v17.4s,v7.4s
+ ext v16.16b,v16.16b,v16.16b,#8
+ rev64 v17.4S,v17.4S
+.inst 0xcec084f2 //sm4e v18.4s,v7.4s
+ ext v17.16b,v17.16b,v17.16b,#8
+ rev64 v18.4S,v18.4S
+.inst 0xcec084f3 //sm4e v19.4s,v7.4s
+ ext v18.16b,v18.16b,v18.16b,#8
+ rev64 v19.4S,v19.4S
+ ext v19.16b,v19.16b,v19.16b,#8
+.inst 0xcec08414 //sm4e v20.4s,v0.4s
+.inst 0xcec08415 //sm4e v21.4s,v0.4s
+.inst 0xcec08416 //sm4e v22.4s,v0.4s
+.inst 0xcec08417 //sm4e v23.4s,v0.4s
+
+.inst 0xcec08434 //sm4e v20.4s,v1.4s
+.inst 0xcec08435 //sm4e v21.4s,v1.4s
+.inst 0xcec08436 //sm4e v22.4s,v1.4s
+.inst 0xcec08437 //sm4e v23.4s,v1.4s
+
+.inst 0xcec08454 //sm4e v20.4s,v2.4s
+.inst 0xcec08455 //sm4e v21.4s,v2.4s
+.inst 0xcec08456 //sm4e v22.4s,v2.4s
+.inst 0xcec08457 //sm4e v23.4s,v2.4s
+
+.inst 0xcec08474 //sm4e v20.4s,v3.4s
+.inst 0xcec08475 //sm4e v21.4s,v3.4s
+.inst 0xcec08476 //sm4e v22.4s,v3.4s
+.inst 0xcec08477 //sm4e v23.4s,v3.4s
+
+.inst 0xcec08494 //sm4e v20.4s,v4.4s
+.inst 0xcec08495 //sm4e v21.4s,v4.4s
+.inst 0xcec08496 //sm4e v22.4s,v4.4s
+.inst 0xcec08497 //sm4e v23.4s,v4.4s
+
+.inst 0xcec084b4 //sm4e v20.4s,v5.4s
+.inst 0xcec084b5 //sm4e v21.4s,v5.4s
+.inst 0xcec084b6 //sm4e v22.4s,v5.4s
+.inst 0xcec084b7 //sm4e v23.4s,v5.4s
+
+.inst 0xcec084d4 //sm4e v20.4s,v6.4s
+.inst 0xcec084d5 //sm4e v21.4s,v6.4s
+.inst 0xcec084d6 //sm4e v22.4s,v6.4s
+.inst 0xcec084d7 //sm4e v23.4s,v6.4s
+
+.inst 0xcec084f4 //sm4e v20.4s,v7.4s
+ rev64 v20.4S,v20.4S
+.inst 0xcec084f5 //sm4e v21.4s,v7.4s
+ ext v20.16b,v20.16b,v20.16b,#8
+ rev64 v21.4S,v21.4S
+.inst 0xcec084f6 //sm4e v22.4s,v7.4s
+ ext v21.16b,v21.16b,v21.16b,#8
+ rev64 v22.4S,v22.4S
+.inst 0xcec084f7 //sm4e v23.4s,v7.4s
+ ext v22.16b,v22.16b,v22.16b,#8
+ rev64 v23.4S,v23.4S
+ ext v23.16b,v23.16b,v23.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+ eor v16.16b,v16.16b,v24.16b
+ eor v17.16b,v17.16b,v25.16b
+ eor v18.16b,v18.16b,v26.16b
+ eor v19.16b,v19.16b,v27.16b
+ eor v20.16b,v20.16b,v28.16b
+ eor v21.16b,v21.16b,v29.16b
+ eor v22.16b,v22.16b,v30.16b
+ eor v23.16b,v23.16b,v31.16b
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64
+ st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64
+ subs x2,x2,#8
+ b.eq 3f
+ add w5,w5,#1
+ mov v8.s[3],w5
+ b 1b
+2:
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08411 //sm4e v17.4s,v0.4s
+.inst 0xcec08412 //sm4e v18.4s,v0.4s
+.inst 0xcec08413 //sm4e v19.4s,v0.4s
+
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08431 //sm4e v17.4s,v1.4s
+.inst 0xcec08432 //sm4e v18.4s,v1.4s
+.inst 0xcec08433 //sm4e v19.4s,v1.4s
+
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08451 //sm4e v17.4s,v2.4s
+.inst 0xcec08452 //sm4e v18.4s,v2.4s
+.inst 0xcec08453 //sm4e v19.4s,v2.4s
+
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08471 //sm4e v17.4s,v3.4s
+.inst 0xcec08472 //sm4e v18.4s,v3.4s
+.inst 0xcec08473 //sm4e v19.4s,v3.4s
+
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec08491 //sm4e v17.4s,v4.4s
+.inst 0xcec08492 //sm4e v18.4s,v4.4s
+.inst 0xcec08493 //sm4e v19.4s,v4.4s
+
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084b1 //sm4e v17.4s,v5.4s
+.inst 0xcec084b2 //sm4e v18.4s,v5.4s
+.inst 0xcec084b3 //sm4e v19.4s,v5.4s
+
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084d1 //sm4e v17.4s,v6.4s
+.inst 0xcec084d2 //sm4e v18.4s,v6.4s
+.inst 0xcec084d3 //sm4e v19.4s,v6.4s
+
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+.inst 0xcec084f1 //sm4e v17.4s,v7.4s
+ ext v16.16b,v16.16b,v16.16b,#8
+ rev64 v17.4S,v17.4S
+.inst 0xcec084f2 //sm4e v18.4s,v7.4s
+ ext v17.16b,v17.16b,v17.16b,#8
+ rev64 v18.4S,v18.4S
+.inst 0xcec084f3 //sm4e v19.4s,v7.4s
+ ext v18.16b,v18.16b,v18.16b,#8
+ rev64 v19.4S,v19.4S
+ ext v19.16b,v19.16b,v19.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ eor v16.16b,v16.16b,v24.16b
+ eor v17.16b,v17.16b,v25.16b
+ eor v18.16b,v18.16b,v26.16b
+ eor v19.16b,v19.16b,v27.16b
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64
+ subs x2,x2,#4
+ b.eq 3f
+ add w5,w5,#1
+ mov v8.s[3],w5
+ b 1b
+1:
+ subs x2,x2,#1
+ b.lt 3f
+ mov v16.16b,v8.16b
+ ld1 {v24.4s},[x0],#16
+.inst 0xcec08410 //sm4e v16.4s,v0.4s
+.inst 0xcec08430 //sm4e v16.4s,v1.4s
+.inst 0xcec08450 //sm4e v16.4s,v2.4s
+.inst 0xcec08470 //sm4e v16.4s,v3.4s
+.inst 0xcec08490 //sm4e v16.4s,v4.4s
+.inst 0xcec084b0 //sm4e v16.4s,v5.4s
+.inst 0xcec084d0 //sm4e v16.4s,v6.4s
+.inst 0xcec084f0 //sm4e v16.4s,v7.4s
+ rev64 v16.4S,v16.4S
+ ext v16.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ eor v16.16b,v16.16b,v24.16b
+ st1 {v16.4s},[x1],#16
+ b.eq 3f
+ add w5,w5,#1
+ mov v8.s[3],w5
+ b 1b
+3:
+ ldp d8,d9,[sp],#16
+ ret
+.size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks
diff --git a/sys/crypto/openssl/aarch64/vpaes-armv8.S b/sys/crypto/openssl/aarch64/vpaes-armv8.S
index c6338b00d5f6..09f0ba9a558f 100644
--- a/sys/crypto/openssl/aarch64/vpaes-armv8.S
+++ b/sys/crypto/openssl/aarch64/vpaes-armv8.S
@@ -1,7 +1,7 @@
/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
#include "arm_arch.h"
-.text
+.section .rodata
.type _vpaes_consts,%object
.align 7 // totally strategic alignment
@@ -93,6 +93,9 @@ _vpaes_consts:
.align 2
.size _vpaes_consts,.-_vpaes_consts
.align 6
+
+.text
+
//
// _aes_preheat
//
@@ -102,7 +105,8 @@ _vpaes_consts:
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
- adr x10, .Lk_inv
+ adrp x10, .Lk_inv
+ add x10, x10, #:lo12:.Lk_inv
movi v17.16b, #0x0f
ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
@@ -130,7 +134,8 @@ _vpaes_encrypt_preheat:
_vpaes_encrypt_core:
mov x9, x2
ldr w8, [x2,#240] // pull rounds
- adr x11, .Lk_mc_forward+16
+ adrp x11, .Lk_mc_forward+16
+ add x11, x11, #:lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -217,7 +222,8 @@ vpaes_encrypt:
_vpaes_encrypt_2x:
mov x9, x2
ldr w8, [x2,#240] // pull rounds
- adr x11, .Lk_mc_forward+16
+ adrp x11, .Lk_mc_forward+16
+ add x11, x11, #:lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -320,9 +326,11 @@ _vpaes_encrypt_2x:
.type _vpaes_decrypt_preheat,%function
.align 4
_vpaes_decrypt_preheat:
- adr x10, .Lk_inv
+ adrp x10, .Lk_inv
+ add x10, x10, #:lo12:.Lk_inv
movi v17.16b, #0x0f
- adr x11, .Lk_dipt
+ adrp x11, .Lk_dipt
+ add x11, x11, #:lo12:.Lk_dipt
ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
@@ -344,10 +352,12 @@ _vpaes_decrypt_core:
// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
eor x11, x11, #0x30 // xor $0x30, %r11
- adr x10, .Lk_sr
+ adrp x10, .Lk_sr
+ add x10, x10, #:lo12:.Lk_sr
and x11, x11, #0x30 // and $0x30, %r11
add x11, x11, x10
- adr x10, .Lk_mc_forward+48
+ adrp x10, .Lk_mc_forward+48
+ add x10, x10, #:lo12:.Lk_mc_forward+48
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -455,10 +465,12 @@ _vpaes_decrypt_2x:
// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
eor x11, x11, #0x30 // xor $0x30, %r11
- adr x10, .Lk_sr
+ adrp x10, .Lk_sr
+ add x10, x10, #:lo12:.Lk_sr
and x11, x11, #0x30 // and $0x30, %r11
add x11, x11, x10
- adr x10, .Lk_mc_forward+48
+ adrp x10, .Lk_mc_forward+48
+ add x10, x10, #:lo12:.Lk_mc_forward+48
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
@@ -587,14 +599,18 @@ _vpaes_decrypt_2x:
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
- adr x10, .Lk_inv
+ adrp x10, .Lk_inv
+ add x10, x10, #:lo12:.Lk_inv
movi v16.16b, #0x5b // .Lk_s63
- adr x11, .Lk_sb1
+ adrp x11, .Lk_sb1
+ add x11, x11, #:lo12:.Lk_sb1
movi v17.16b, #0x0f // .Lk_s0F
ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
- adr x10, .Lk_dksd
+ adrp x10, .Lk_dksd
+ add x10, x10, #:lo12:.Lk_dksd
ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
- adr x11, .Lk_mc_forward
+ adrp x11, .Lk_mc_forward
+ add x11, x11, #:lo12:.Lk_mc_forward
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
ld1 {v8.2d}, [x10] // .Lk_rcon
@@ -618,7 +634,8 @@ _vpaes_schedule_core:
bl _vpaes_schedule_transform
mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
- adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
+ adrp x10, .Lk_sr
+ add x10, x10, #:lo12:.Lk_sr
add x8, x8, x10
cbnz w3, .Lschedule_am_decrypting
@@ -744,12 +761,14 @@ _vpaes_schedule_core:
.align 4
.Lschedule_mangle_last:
// schedule last round key from xmm0
- adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ adrp x11, .Lk_deskew
+ add x11, x11, #:lo12:.Lk_deskew
cbnz w3, .Lschedule_mangle_last_dec
// encrypting
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
- adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
+ adrp x11, .Lk_opt
+ add x11, x11, #:lo12:.Lk_opt
add x2, x2, #32 // add $32, %rdx
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
diff --git a/sys/crypto/openssl/aarch64/vpsm4-armv8.S b/sys/crypto/openssl/aarch64/vpsm4-armv8.S
new file mode 100644
index 000000000000..830e0315a2be
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/vpsm4-armv8.S
@@ -0,0 +1,5021 @@
+/* Do not modify. This file is auto-generated from vpsm4-armv8.pl. */
+// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+//
+// This module implements SM4 with ASIMD on aarch64
+//
+// Feb 2022
+//
+
+// $output is the last argument if it looks like a file (it has an extension)
+// $flavour is the first argument if it doesn't look like a file
+#include "arm_arch.h"
+.arch armv8-a
+.text
+
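+// Constant pool shared by all routines below: the 256-byte SM4 S-box
+// (.Lsbox, indexed with tbl/tbx across v16-v31), the key-schedule constants
+// CK (.Lck), the family key FK (.Lfk), a byte-shuffle mask used to rotate
+// the key state by one word (.Lshuffles) and the .Lxts_magic constant used
+// for XTS tweak processing.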
+.section .rodata
+.type _vpsm4_consts,%object
+.align 7
+_vpsm4_consts:
+.Lsbox:
+.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
+.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
+.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
+.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
+.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
+.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
+.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
+.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
+.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
+.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
+.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
+.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
+.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
+.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
+.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
+.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
+.Lck:
+.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
+.Lshuffles:
+.quad 0x0B0A090807060504,0x030201000F0E0D0C
+.Lxts_magic:
+.quad 0x0101010101010187,0x0101010101010101
+
+.size _vpsm4_consts,.-_vpsm4_consts
+
+.previous
+
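+// _vpsm4_set_key: expand the 128-bit user key at x0 into the 32 SM4 round
+// keys at x1.  The key is XORed with FK, then each round key is derived
+// with a tbl/tbx S-box lookup and the key-schedule linear transform; w2
+// selects the store order (non-zero: forwards for encryption, zero:
+// backwards from x1+124 for decryption).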
+.type _vpsm4_set_key,%function
+.align 4
+_vpsm4_set_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v5.4s},[x0]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ adrp x5,.Lshuffles
+ add x5,x5,#:lo12:.Lshuffles
+ ld1 {v7.2d},[x5]
+ adrp x5,.Lfk
+ add x5,x5,#:lo12:.Lfk
+ ld1 {v6.2d},[x5]
+ eor v5.16b,v5.16b,v6.16b
+ mov x6,#32
+ adrp x5,.Lck
+ add x5,x5,#:lo12:.Lck
+ movi v0.16b,#64
+ cbnz w2,1f
+ add x1,x1,124
+1:
+ mov w7,v5.s[1]
+ ldr w8,[x5],#4
+ eor w8,w8,w7
+ mov w7,v5.s[2]
+ eor w8,w8,w7
+ mov w7,v5.s[3]
+ eor w8,w8,w7
+ // sbox lookup
+ mov v4.s[0],w8
+ tbl v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b
+ sub v4.16b,v4.16b,v0.16b
+ tbx v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b
+ sub v4.16b,v4.16b,v0.16b
+ tbx v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b
+ sub v4.16b,v4.16b,v0.16b
+ tbx v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b
+ mov w7,v1.s[0]
+ eor w8,w7,w7,ror #19
+ eor w8,w8,w7,ror #9
+ mov w7,v5.s[0]
+ eor w8,w8,w7
+ mov v5.s[0],w8
+ cbz w2,2f
+ str w8,[x1],#4
+ b 3f
+2:
+ str w8,[x1],#-4
+3:
+ tbl v5.16b,{v5.16b},v7.16b
+ subs x6,x6,#1
+ b.ne 1b
+ ret
+.size _vpsm4_set_key,.-_vpsm4_set_key
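+// _vpsm4_enc_4blks: run four blocks through the 32 SM4 rounds with the
+// block words held column-wise in v4-v7 (one word per register, one block
+// per lane).  Each round uses four tbl lookups over the v16-v31 S-box copy,
+// then the linear transform L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^
+// rol(x,24) built from ushr/sli rotate pairs.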
+.type _vpsm4_enc_4blks,%function
+.align 4
+_vpsm4_enc_4blks:
+ AARCH64_VALID_CALL_TARGET
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor v14.16b,v6.16b,v7.16b
+ eor v12.16b,v5.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v12.16b,v0.16b
+ sub v1.16b,v12.16b,v1.16b
+ sub v2.16b,v12.16b,v2.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v12.2d
+ add v12.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v12.4s,32-10
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v12.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v12.4s,32-18
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v12.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v12.4s,24
+ eor v12.16b,v2.16b,v1.16b
+ eor v4.16b,v4.16b,v12.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor v14.16b,v14.16b,v4.16b
+ eor v13.16b,v14.16b,v13.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v13.16b,v0.16b
+ sub v1.16b,v13.16b,v1.16b
+ sub v2.16b,v13.16b,v2.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v13.2d
+ add v13.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v13.4s,32-2
+ sli v0.4s,v13.4s,2
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v13.16b
+ sli v2.4s,v13.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v13.4s,32-18
+ sli v0.4s,v13.4s,18
+ ushr v2.4s,v13.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v1.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v13.16b
+
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor v14.16b,v4.16b,v5.16b
+ eor v12.16b,v7.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v12.16b,v0.16b
+ sub v1.16b,v12.16b,v1.16b
+ sub v2.16b,v12.16b,v2.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v12.2d
+ add v12.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v12.4s,32-10
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v12.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v12.4s,32-18
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v12.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v12.4s,24
+ eor v12.16b,v2.16b,v1.16b
+ eor v6.16b,v6.16b,v12.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor v14.16b,v14.16b,v6.16b
+ eor v13.16b,v14.16b,v13.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v13.16b,v0.16b
+ sub v1.16b,v13.16b,v1.16b
+ sub v2.16b,v13.16b,v2.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v13.2d
+ add v13.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v13.4s,32-2
+ sli v0.4s,v13.4s,2
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v13.16b
+ sli v2.4s,v13.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v13.4s,32-18
+ sli v0.4s,v13.4s,18
+ ushr v2.4s,v13.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v1.16b
+ eor v7.16b,v7.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+ ret
+.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
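+// _vpsm4_enc_8blks: same rounds as _vpsm4_enc_4blks, but two groups of four
+// blocks (v4-v7 and v8-v11) are kept in flight so the S-box lookups and
+// rotates of both halves interleave.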
+.type _vpsm4_enc_8blks,%function
+.align 4
+_vpsm4_enc_8blks:
+ AARCH64_VALID_CALL_TARGET
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ dup v12.4s,w7
+ eor v14.16b,v6.16b,v7.16b
+ eor v15.16b,v10.16b,v11.16b
+ eor v0.16b,v5.16b,v12.16b
+ eor v1.16b,v9.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ eor v4.16b,v4.16b,v12.16b
+ eor v8.16b,v8.16b,v13.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v4.16b
+ eor v15.16b,v15.16b,v8.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v12.16b
+ eor v9.16b,v9.16b,v13.16b
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ dup v12.4s,w7
+ eor v14.16b,v4.16b,v5.16b
+ eor v15.16b,v8.16b,v9.16b
+ eor v0.16b,v7.16b,v12.16b
+ eor v1.16b,v11.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ eor v6.16b,v6.16b,v12.16b
+ eor v10.16b,v10.16b,v13.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v10.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ eor v7.16b,v7.16b,v12.16b
+ eor v11.16b,v11.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v8.16b
+#else
+ mov v7.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v9.16b
+#else
+ mov v6.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v10.16b
+#else
+ mov v5.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v11.16b
+#else
+ mov v4.16b,v11.16b
+#endif
+ ret
+.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
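+// vpsm4_set_encrypt_key / vpsm4_set_decrypt_key: wrappers around
+// _vpsm4_set_key with the direction flag in w2 (1 = encryption order,
+// 0 = reversed order for decryption).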
+.globl vpsm4_set_encrypt_key
+.type vpsm4_set_encrypt_key,%function
+.align 5
+vpsm4_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,1
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key
+.globl vpsm4_set_decrypt_key
+.type vpsm4_set_decrypt_key,%function
+.align 5
+vpsm4_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,0
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key
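+// vpsm4_encrypt / vpsm4_decrypt: single-block path.  The state words stay
+// in w12-w15; each round pushes one word through the S-box via a single
+// vector lane (tbl) and applies the rotate/XOR linear transform in general
+// registers.  Decryption is the same code driven by the reversed schedule.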
+.globl vpsm4_encrypt
+.type vpsm4_encrypt,%function
+.align 5
+vpsm4_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_encrypt,.-vpsm4_encrypt
+.globl vpsm4_decrypt
+.type vpsm4_decrypt,%function
+.align 5
+vpsm4_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_decrypt,.-vpsm4_decrypt
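+// vpsm4_ecb_encrypt: bulk ECB.  ld4/st4 de-interleave the blocks so
+// _vpsm4_enc_8blks and _vpsm4_enc_4blks can work on whole batches; the
+// remaining 1-3 blocks go through the inlined round loop or partially
+// filled ld4/st4 lanes.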
+.globl vpsm4_ecb_encrypt
+.type vpsm4_ecb_encrypt,%function
+.align 5
+vpsm4_ecb_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ // convert length into blocks
+ lsr x2,x2,4
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+.Lecb_8_blocks_process:
+ cmp w2,#8
+ b.lt .Lecb_4_blocks_process
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_enc_8blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lecb_8_blocks_process
+ b 100f
+.Lecb_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub w2,w2,#4
+1:
+ // process last block
+ cmp w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ b 100f
+1: // process last 2 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
+ cmp w2,#2
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
+ b 100f
+1: // process last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt
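+// vpsm4_cbc_encrypt: CBC.  Encryption (w5 != 0) must chain, so each block
+// is XORed with the previous ciphertext and run through an inlined round
+// loop one at a time; w5 == 0 branches to the .Ldec decryption path.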
+.globl vpsm4_cbc_encrypt
+.type vpsm4_cbc_encrypt,%function
+.align 5
+vpsm4_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ lsr x2,x2,4
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+ cbz w5,.Ldec
+ ld1 {v3.4s},[x4]
+.Lcbc_4_blocks_enc:
+ cmp w2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b,v4.16b,v3.16b
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+ eor v5.16b,v5.16b,v4.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v5.s[0]
+ mov w13,v5.s[1]
+ mov w14,v5.s[2]
+ mov w15,v5.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v5.s[0],w15
+ mov v5.s[1],w14
+ mov v5.s[2],w13
+ mov v5.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v6.16b,v6.16b,v5.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v6.s[0]
+ mov w13,v6.s[1]
+ mov w14,v6.s[2]
+ mov w15,v6.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v6.s[0],w15
+ mov v6.s[1],w14
+ mov v6.s[2],w13
+ mov v6.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ eor v7.16b,v7.16b,v6.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v7.s[0]
+ mov w13,v7.s[1]
+ mov w14,v7.s[2]
+ mov w15,v7.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v7.s[0],w15
+ mov v7.s[1],w14
+ mov v7.s[2],w13
+ mov v7.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ orr v3.16b,v7.16b,v7.16b
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lcbc_4_blocks_enc
+ b 2f
+1:
+ subs w2,w2,#1
+ b.lt 2f
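+ // Single-block CBC tail: v3 carries the previous ciphertext block; XOR it with
+ // the next plaintext block, run the scalar round loop, and the result becomes
+ // the new chaining value in v3.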
+ ld1 {v4.4s},[x0],#16
+ eor v3.16b,v3.16b,v4.16b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ st1 {v3.4s},[x1],#16
+ b 1b
+2:
+ // save back IV
+ st1 {v3.4s},[x4]
+ ret
+
+.Ldec:
+ // decryption mode starts
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+.Lcbc_8_blocks_dec:
+ cmp w2,#8
+ b.lt 1f
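+ // Each ld4 loads four 16-byte blocks de-interleaved so that vN holds word N of
+ // every block, the column layout expected by _vpsm4_enc_8blks.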
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+ add x10,x0,#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_enc_8blks
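+ // Transpose the column-form results back into block order before the CBC XOR
+ // with the IV and preceding ciphertext blocks below.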
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ ld1 {v15.4s},[x4]
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ // note: ivec1 and vtmpx[3] reuse the same register,
+ // so care must be taken to avoid a conflict
+ eor v0.16b,v0.16b,v15.16b
+ ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v1.16b,v1.16b,v8.16b
+ eor v2.16b,v2.16b,v9.16b
+ eor v3.16b,v3.16b,v10.16b
+ // save back IV
+ st1 {v15.4s}, [x4]
+ eor v4.16b,v4.16b,v11.16b
+ eor v5.16b,v5.16b,v12.16b
+ eor v6.16b,v6.16b,v13.16b
+ eor v7.16b,v7.16b,v14.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+1:
+ ld1 {v15.4s},[x4]
+.Lcbc_4_blocks_dec:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ orr v15.16b,v7.16b,v7.16b
+ eor v2.16b,v2.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+ st1 {v7.4s}, [x4]
+ b 100f
+1: // last block
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ // save back IV
+ st1 {v4.4s}, [x4]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v4.16b
+#else
+ mov v8.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ eor v8.16b,v8.16b,v15.16b
+ st1 {v8.4s},[x1],#16
+ b 100f
+1: // last two blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
+ add x10,x0,#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
+ subs w2,w2,1
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ ld1 {v4.4s,v5.4s},[x0],#32
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save back IV
+ st1 {v5.4s}, [x4]
+ b 100f
+1: // last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ eor v2.16b,v2.16b,v5.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save back IV
+ st1 {v6.4s}, [x4]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt
+.globl vpsm4_ctr32_encrypt_blocks
+.type vpsm4_ctr32_encrypt_blocks,%function
+.align 5
+vpsm4_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.4s},[x4]
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+ cmp w2,#1
+ b.ne 1f
+ // fast path for a single block, avoiding the
+ // context-saving overhead
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ ret
+1:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w5,v3.s[3]
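+ // CTR32: w12-w14 hold the fixed words of the counter block; only the last
+ // 32-bit word (w5) is incremented per block and spread across the SIMD lanes.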
+.Lctr32_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ add w5,w5,#1
+ mov v7.s[2],w5
+ add w5,w5,#1
+ mov v7.s[3],w5
+ add w5,w5,#1
+ cmp w2,#8
+ b.ge .Lctr32_8_blocks_process
+ bl _vpsm4_enc_4blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+.Lctr32_8_blocks_process:
+ dup v8.4s,w12
+ dup v9.4s,w13
+ dup v10.4s,w14
+ mov v11.s[0],w5
+ add w5,w5,#1
+ mov v11.s[1],w5
+ add w5,w5,#1
+ mov v11.s[2],w5
+ add w5,w5,#1
+ mov v11.s[3],w5
+ add w5,w5,#1
+ bl _vpsm4_enc_8blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ eor v4.16b,v4.16b,v8.16b
+ eor v5.16b,v5.16b,v9.16b
+ eor v6.16b,v6.16b,v10.16b
+ eor v7.16b,v7.16b,v11.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+1: // last block processing
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
+ mov v3.s[0],w12
+ mov v3.s[1],w13
+ mov v3.s[2],w14
+ mov v3.s[3],w5
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ b 100f
+1: // last 2 blocks processing
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ subs w2,w2,#1
+ b.ne 1f
+ bl _vpsm4_enc_4blks
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ b 100f
+1: // last 3 blocks processing
+ add w5,w5,#1
+ mov v7.s[2],w5
+ bl _vpsm4_enc_4blks
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks
+.globl vpsm4_xts_encrypt_gb
+.type vpsm4_xts_encrypt_gb,%function
+.align 5
+vpsm4_xts_encrypt_gb:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v8.4s}, [x5]
+ mov x3,x27
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
+ cmp x2,#1
+ b.lt .return_gb
+
+ cmp x29,0
+ // If the encryption/decryption length is a multiple of 16,
+ // all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ b.eq .xts_encrypt_blocks_gb
+
+ // If the encryption/decryption length is not a multiple of 16,
+ // the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb,
+ // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak_gb
+.xts_encrypt_blocks_gb:
+ rbit v8.16b,v8.16b
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x12,v8.d[0]
+ mov x13,v8.d[1]
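+ // Derive tweaks for eight blocks in scalar registers (x12:x13 ... x26:x27):
+ // each group below multiplies the previous tweak by x in GF(2^128), folding in
+ // the 0x87 reduction polynomial when bit 127 carries out.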
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process_gb:
+ cmp x2,#8
+ b.lt .Lxts_4_blocks_process_gb
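+ // 8-block XTS body: XOR each input block with its tweak, transpose into column
+ // form for _vpsm4_enc_8blks, transpose back and XOR with the tweaks again,
+ // while the next eight tweaks are computed in the scalar registers in between.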
+ mov v0.d[0],x12
+ mov v0.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v0.16b,v0.16b
+#endif
+ mov v1.d[0],x14
+ mov v1.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v1.16b,v1.16b
+#endif
+ mov v2.d[0],x16
+ mov v2.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v2.16b,v2.16b
+#endif
+ mov v3.d[0],x18
+ mov v3.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov v12.d[0],x20
+ mov v12.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov v13.d[0],x22
+ mov v13.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov v14.d[0],x24
+ mov v14.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov v15.d[0],x26
+ mov v15.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v0.16b,v0.16b
+ rbit v1.16b,v1.16b
+ rbit v2.16b,v2.16b
+ rbit v3.16b,v3.16b
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v3.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ rbit v12.16b,v12.16b
+ rbit v13.16b,v13.16b
+ rbit v14.16b,v14.16b
+ rbit v15.16b,v15.16b
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ mov v12.d[0],x12
+ mov v12.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v13.d[0],x14
+ mov v13.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v14.d[0],x16
+ mov v14.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v15.d[0],x18
+ mov v15.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v11.d[0],x26
+ mov v11.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v13.16b
+ eor v2.16b, v2.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+
+ // save the last tweak
+ st1 {v11.4s},[x5]
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process_gb
+ b 100f
+.Lxts_4_blocks_process_gb:
+ mov v8.d[0],x12
+ mov v8.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x14
+ mov v9.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x16
+ mov v10.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov v11.d[0],x18
+ mov v11.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v8.16b,v8.16b
+ rbit v9.16b,v9.16b
+ rbit v10.16b,v10.16b
+ rbit v11.16b,v11.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ // save the last tweak
+ st1 {v11.4s},[x5]
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ rbit v8.16b,v8.16b
+ eor v4.16b, v4.16b, v8.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v8.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ st1 {v8.4s},[x5]
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ rbit v8.16b,v8.16b
+ rbit v9.16b,v9.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ st1 {v9.4s},[x5]
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ rbit v8.16b,v8.16b
+ rbit v9.16b,v9.16b
+ rbit v10.16b,v10.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ st1 {v10.4s},[x5]
+100:
+ cmp x29,0
+ b.eq .return_gb
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is larger than 32
+.last_2blks_tweak_gb:
+ ld1 {v8.4s},[x5]
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
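+ // SIMD tweak doubling (multiply by x in GF(2^128)): shift every byte left by
+ // one, then fold the carried-out top bits back in via ext/ushr and the
+ // .Lxts_magic constant; the rbit pairs keep the tweak in the bit order used by
+ // this GB variant.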
+ rbit v2.16b,v8.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v9.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v9.16b, v9.16b, v1.16b
+ rbit v9.16b,v9.16b
+ rbit v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ rbit v10.16b,v10.16b
+ b .check_dec_gb
+
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is exactly 32, which needs only these two tweaks
+.only_2blks_tweak_gb:
+ mov v9.16b,v8.16b
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ rbit v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ rbit v10.16b,v10.16b
+ b .check_dec_gb
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec_gb:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks_gb
+ mov v0.16B,v9.16b
+ mov v9.16B,v10.16b
+ mov v10.16B,v0.16b
+
+.process_last_2blks_gb:
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v9.16b
+ st1 {v4.4s},[x1],#16
+
+ sub x26,x1,16
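+ // Ciphertext stealing for the partial tail: swap bytes between the last full
+ // ciphertext block (at x26) and the remaining input, emit the stolen bytes as
+ // the output tail, then re-encrypt the rebuilt block at x26 under the final
+ // tweak (v10).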
+.loop_gb:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop_gb
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v10.16b
+ st1 {v4.4s}, [x26]
+.return_gb:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb
+.globl vpsm4_xts_encrypt
+.type vpsm4_xts_encrypt,%function
+.align 5
+vpsm4_xts_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v8.4s}, [x5]
+ mov x3,x27
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
+ cmp x2,#1
+ b.lt .return
+
+ cmp x29,0
+ // If the encryption/decryption length is a multiple of 16,
+ // all blocks are encrypted/decrypted in .xts_encrypt_blocks
+ b.eq .xts_encrypt_blocks
+
+ // If the encryption/decryption length is not a multiple of 16,
+ // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak,
+ // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak
+.xts_encrypt_blocks:
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x12,v8.d[0]
+ mov x13,v8.d[1]
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process:
+ cmp x2,#8
+ b.lt .Lxts_4_blocks_process
+ mov v0.d[0],x12
+ mov v0.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v0.16b,v0.16b
+#endif
+ mov v1.d[0],x14
+ mov v1.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v1.16b,v1.16b
+#endif
+ mov v2.d[0],x16
+ mov v2.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v2.16b,v2.16b
+#endif
+ mov v3.d[0],x18
+ mov v3.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov v12.d[0],x20
+ mov v12.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov v13.d[0],x22
+ mov v13.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov v14.d[0],x24
+ mov v14.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov v15.d[0],x26
+ mov v15.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v3.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
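+ // Transpose the eight blocks from block-major order (one block per
+ // register) to word-major order (each register holds the same 32-bit
+ // word of four blocks), as expected by _vpsm4_enc_8blks.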
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_8blks
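+ // Transpose the results back to block-major order.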
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ mov v12.d[0],x12
+ mov v12.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v13.d[0],x14
+ mov v13.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v14.d[0],x16
+ mov v14.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v15.d[0],x18
+ mov v15.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v11.d[0],x26
+ mov v11.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v13.16b
+ eor v2.16b, v2.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+
+ // save the last tweak
+ st1 {v11.4s},[x5]
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process
+ b 100f
+.Lxts_4_blocks_process:
+ mov v8.d[0],x12
+ mov v8.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x14
+ mov v9.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x16
+ mov v10.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov v11.d[0],x18
+ mov v11.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ // save the last tweak
+ st1 {v11.4s},[x5]
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v8.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v8.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ st1 {v8.4s},[x5]
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ st1 {v9.4s},[x5]
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ st1 {v10.4s},[x5]
+100:
+ cmp x29,0
+ b.eq .return
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is larger than 32
+.last_2blks_tweak:
+ ld1 {v8.4s},[x5]
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v2.16b,v8.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
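+ // Vectorised GF(2^128) doubling of the tweak: shl doubles every byte,
+ // ext/ushr pick up each byte's carry from its lower neighbour (with
+ // byte 15 wrapping round to byte 0), and the multiply by .Lxts_magic
+ // scales the wrapped carry by 0x87 and the others by 0x01 before the
+ // final XOR.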
+ shl v9.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v9.16b, v9.16b, v1.16b
+ mov v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ b .check_dec
+
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is exactly 32, which needs only these two tweaks
+.only_2blks_tweak:
+ mov v9.16b,v8.16b
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ b .check_dec
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks
+ mov v0.16B,v9.16b
+ mov v9.16B,v10.16b
+ mov v10.16B,v0.16b
+
+.process_last_2blks:
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v9.16b
+ st1 {v4.4s},[x1],#16
+
+ sub x26,x1,16
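+ // XTS ciphertext stealing: x26 points at the last full ciphertext
+ // block just written. Swap its tail with the x29 remaining plaintext
+ // bytes (the stolen ciphertext bytes become the final partial output
+ // block), then re-encrypt the rebuilt block with the last tweak (v10).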
+.loop:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v10.16b
+ st1 {v4.4s}, [x26]
+.return:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_xts_encrypt,.-vpsm4_xts_encrypt
diff --git a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
new file mode 100644
index 000000000000..5627d6d1c6b4
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
@@ -0,0 +1,4523 @@
+/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
+// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+//
+// This module implements SM4 with ASIMD and AESE on AARCH64
+//
+// Dec 2022
+//
+
+// $output is the last argument if it looks like a file (it has an extension)
+// $flavour is the first argument if it doesn't look like a file
+#include "arm_arch.h"
+.arch armv8-a+crypto
+.text
+
+.type _vpsm4_ex_consts,%object
+.align 7
+_vpsm4_ex_consts:
+.Lck:
+.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
+.Lshuffles:
+.quad 0x0B0A090807060504,0x030201000F0E0D0C
+.Lxts_magic:
+.quad 0x0101010101010187,0x0101010101010101
+.Lsbox_magic:
+.quad 0x0b0e0104070a0d00,0x0306090c0f020508
+.quad 0x62185a2042387a00,0x22581a6002783a40
+.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
+.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
+.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
+.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
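+// The .Lsbox_magic constants drive the AESE-based S-box: an inverse
+// ShiftRows byte shuffle, nibble-wise affine-transform tables mapping
+// between the SM4 and AES field representations, and a 0x0f nibble mask.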
+
+.size _vpsm4_ex_consts,.-_vpsm4_ex_consts
+.type _vpsm4_ex_set_key,%function
+.align 4
+_vpsm4_ex_set_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v5.4s},[x0]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ adrp x5,.Lshuffles
+ add x5,x5,#:lo12:.Lshuffles
+ ld1 {v7.2d},[x5]
+ adrp x5,.Lfk
+ add x5,x5,#:lo12:.Lfk
+ ld1 {v6.2d},[x5]
+ eor v5.16b,v5.16b,v6.16b
+ mov x6,#32
+ adrp x5,.Lck
+ add x5,x5,#:lo12:.Lck
+ movi v0.16b,#64
+ cbnz w2,1f
+ add x1,x1,124
+1:
+ mov w7,v5.s[1]
+ ldr w8,[x5],#4
+ eor w8,w8,w7
+ mov w7,v5.s[2]
+ eor w8,w8,w7
+ mov w7,v5.s[3]
+ eor w8,w8,w7
+ // optimize sbox using AESE instruction
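+ // The byte is pre-shuffled with the inverse-ShiftRows pattern (v26) so
+ // the ShiftRows inside AESE cancels out; the tbl pairs apply nibble-wise
+ // affine maps into and out of the AES field, and AESE with a zero round
+ // key supplies the AES SubBytes step in between.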
+ mov v4.s[0],w8
+ tbl v0.16b, {v4.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ mov w7,v0.s[0]
+ eor w8,w7,w7,ror #19
+ eor w8,w8,w7,ror #9
+ mov w7,v5.s[0]
+ eor w8,w8,w7
+ mov v5.s[0],w8
+ cbz w2,2f
+ str w8,[x1],#4
+ b 3f
+2:
+ str w8,[x1],#-4
+3:
+ tbl v5.16b,{v5.16b},v7.16b
+ subs x6,x6,#1
+ b.ne 1b
+ ret
+.size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key
+.type _vpsm4_ex_enc_4blks,%function
+.align 4
+_vpsm4_ex_enc_4blks:
+ AARCH64_VALID_CALL_TARGET
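+ // Encrypt four word-sliced blocks in parallel: on entry v4-v7 hold
+ // words 0-3 of all four blocks, x3 points at the 32 round keys, and
+ // the loop runs 8 iterations of 4 rounds each. Results are returned
+ // in v0-v3.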
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor v14.16b,v6.16b,v7.16b
+ eor v12.16b,v5.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v12.16b,v0.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ eor v4.16b,v4.16b,v12.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor v14.16b,v14.16b,v4.16b
+ eor v13.16b,v14.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v13.16b,v0.16b
+
+ // linear transformation
+ ushr v0.4s,v13.4s,32-2
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v0.4s,v13.4s,2
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v0.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v13.16b
+
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor v14.16b,v4.16b,v5.16b
+ eor v12.16b,v7.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v12.16b,v0.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ eor v6.16b,v6.16b,v12.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor v14.16b,v14.16b,v6.16b
+ eor v13.16b,v14.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v13.16b,v0.16b
+
+ // linear transformation
+ ushr v0.4s,v13.4s,32-2
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v0.4s,v13.4s,2
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v0.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v7.16b,v7.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+ ret
+.size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks
+.type _vpsm4_ex_enc_8blks,%function
+.align 4
+_vpsm4_ex_enc_8blks:
+ AARCH64_VALID_CALL_TARGET
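+ // As _vpsm4_ex_enc_4blks, but for eight blocks: v4-v7 and v8-v11 each
+ // hold one word-sliced group of four blocks; results are returned in
+ // v0-v3 and v4-v7.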
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ dup v12.4s,w7
+ eor v14.16b,v6.16b,v7.16b
+ eor v15.16b,v10.16b,v11.16b
+ eor v0.16b,v5.16b,v12.16b
+ eor v1.16b,v9.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v4.16b,v4.16b,v12.16b
+ eor v8.16b,v8.16b,v13.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v4.16b
+ eor v15.16b,v15.16b,v8.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v12.16b
+ eor v9.16b,v9.16b,v13.16b
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ dup v12.4s,w7
+ eor v14.16b,v4.16b,v5.16b
+ eor v15.16b,v8.16b,v9.16b
+ eor v0.16b,v7.16b,v12.16b
+ eor v1.16b,v11.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v6.16b,v6.16b,v12.16b
+ eor v10.16b,v10.16b,v13.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v10.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v7.16b,v7.16b,v12.16b
+ eor v11.16b,v11.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v8.16b
+#else
+ mov v7.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v9.16b
+#else
+ mov v6.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v10.16b
+#else
+ mov v5.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v11.16b
+#else
+ mov v4.16b,v11.16b
+#endif
+ ret
+.size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks
+.globl vpsm4_ex_set_encrypt_key
+.type vpsm4_ex_set_encrypt_key,%function
+.align 5
+vpsm4_ex_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,1
+ bl _vpsm4_ex_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key
+.globl vpsm4_ex_set_decrypt_key
+.type vpsm4_ex_set_decrypt_key,%function
+.align 5
+vpsm4_ex_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,0
+ bl _vpsm4_ex_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key
+.globl vpsm4_ex_encrypt
+.type vpsm4_ex_encrypt,%function
+.align 5
+vpsm4_ex_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt
+.globl vpsm4_ex_decrypt
+.type vpsm4_ex_decrypt,%function
+.align 5
+vpsm4_ex_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt
+.globl vpsm4_ex_ecb_encrypt
+.type vpsm4_ex_ecb_encrypt,%function
+.align 5
+vpsm4_ex_ecb_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ // convert length into blocks
+ lsr x2,x2,4
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+.Lecb_8_blocks_process:
+ cmp w2,#8
+ b.lt .Lecb_4_blocks_process
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
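+ // ld4 de-interleaves the 32-bit words of four consecutive blocks, so
+ // the data arrives already word-sliced and no explicit transpose is
+ // needed before _vpsm4_ex_enc_8blks.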
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_ex_enc_8blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lecb_8_blocks_process
+ b 100f
+.Lecb_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub w2,w2,#4
+1:
+ // process last block
+ cmp w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ b 100f
+1: // process last 2 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
+ cmp w2,#2
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
+ b 100f
+1: // process last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt
+.globl vpsm4_ex_cbc_encrypt
+.type vpsm4_ex_cbc_encrypt,%function
+.align 5
+vpsm4_ex_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ lsr x2,x2,4
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+ cbz w5,.Ldec
+ ld1 {v3.4s},[x4]
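+ // CBC encryption is inherently serial: v3 carries the IV / previous
+ // ciphertext block, and the four blocks loaded per iteration are
+ // chained and encrypted one after another with the scalar round code.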
+.Lcbc_4_blocks_enc:
+ cmp w2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b,v4.16b,v3.16b
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+ eor v5.16b,v5.16b,v4.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v5.s[0]
+ mov w13,v5.s[1]
+ mov w14,v5.s[2]
+ mov w15,v5.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v5.s[0],w15
+ mov v5.s[1],w14
+ mov v5.s[2],w13
+ mov v5.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v6.16b,v6.16b,v5.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v6.s[0]
+ mov w13,v6.s[1]
+ mov w14,v6.s[2]
+ mov w15,v6.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v6.s[0],w15
+ mov v6.s[1],w14
+ mov v6.s[2],w13
+ mov v6.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ eor v7.16b,v7.16b,v6.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v7.s[0]
+ mov w13,v7.s[1]
+ mov w14,v7.s[2]
+ mov w15,v7.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v7.s[0],w15
+ mov v7.s[1],w14
+ mov v7.s[2],w13
+ mov v7.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ orr v3.16b,v7.16b,v7.16b
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lcbc_4_blocks_enc
+ b 2f
+1:
+ subs w2,w2,#1
+ b.lt 2f
+ ld1 {v4.4s},[x0],#16
+ eor v3.16b,v3.16b,v4.16b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ st1 {v3.4s},[x1],#16
+ b 1b
+2:
+ // save back IV
+ st1 {v3.4s},[x4]
+ ret
+
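The scalar loop above (label 10:) computes SM4 rounds four at a time: each step replaces one 32-bit state word with itself XORed against L(S(x)), where x is the XOR of the other three words and a round key, S applies the SM4 S-box byte-wise (here via the AESE trick), and L(t) = t ^ rol(t,2) ^ rol(t,10) ^ rol(t,18) ^ rol(t,24) is the eor/ror chain. A minimal C sketch of one round step, assuming a hypothetical sm4_sbox_word() helper in place of the vectorized S-box:

    #include <stdint.h>

    /* Hypothetical stand-in for the AESE-based S-box used by the
     * assembly: applies the SM4 S-box to each byte of the word. */
    extern uint32_t sm4_sbox_word(uint32_t x);

    static inline uint32_t rol32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* One round step: "B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)" followed by the
     * linear transform L implemented by the eor/ror chain. */
    static uint32_t sm4_round(uint32_t b0, uint32_t b1, uint32_t b2,
                              uint32_t b3, uint32_t rk)
    {
        uint32_t t = sm4_sbox_word(b1 ^ b2 ^ b3 ^ rk);
        return b0 ^ t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ rol32(t, 24);
    }

The loop applies this step to B0..B3 in turn with round keys RK0..RK3 and iterates eight times (w11), giving the 32 SM4 rounds; the final mov v*.s[0..3] lines store the state words in reversed order, which is SM4's output reflection.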
+.Ldec:
+ // decryption mode starts
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+.Lcbc_8_blocks_dec:
+ cmp w2,#8
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+ add x10,x0,#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_ex_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ ld1 {v15.4s},[x4]
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+	// note: ivec1 and vtmpx[3] reuse the same register,
+	// so care must be taken to avoid a conflict
+ eor v0.16b,v0.16b,v15.16b
+ ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v1.16b,v1.16b,v8.16b
+ eor v2.16b,v2.16b,v9.16b
+ eor v3.16b,v3.16b,v10.16b
+ // save back IV
+ st1 {v15.4s}, [x4]
+ eor v4.16b,v4.16b,v11.16b
+ eor v5.16b,v5.16b,v12.16b
+ eor v6.16b,v6.16b,v13.16b
+ eor v7.16b,v7.16b,v14.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+1:
+ ld1 {v15.4s},[x4]
+.Lcbc_4_blocks_dec:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ orr v15.16b,v7.16b,v7.16b
+ eor v2.16b,v2.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+ st1 {v7.4s}, [x4]
+ b 100f
+1: // last block
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ // save back IV
+ st1 {v4.4s}, [x4]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v4.16b
+#else
+ mov v8.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ eor v8.16b,v8.16b,v15.16b
+ st1 {v8.4s},[x1],#16
+ b 100f
+1: // last two blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
+ add x10,x0,#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
+ subs w2,w2,1
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ ld1 {v4.4s,v5.4s},[x0],#32
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save back IV
+ st1 {v5.4s}, [x4]
+ b 100f
+1: // last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ eor v2.16b,v2.16b,v5.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save back IV
+ st1 {v6.4s}, [x4]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt
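The sequence marked "optimize sbox using AESE instruction" evaluates the SM4 S-box through the AES S-box core: a byte shuffle, a nibble-wise affine change of basis done with two tbl lookups, one AESE with an all-zero round key (AddRoundKey with zero, ShiftRows and SubBytes), and a second nibble-wise affine map back to the SM4 basis. The six constant vectors live at .Lsbox_magic and are not reproduced here. A rough C sketch of the same dataflow with NEON intrinsics, the constants left as external placeholders:

    #include <arm_neon.h>

    /* Placeholders for the six 16-byte constants at .Lsbox_magic: the
     * input byte shuffle, pre-/post-affine nibble tables and the 0x0f
     * nibble mask (values intentionally omitted). */
    extern const uint8x16_t shuffle, pre_lo, pre_hi, post_lo, post_hi, nib_mask;

    /* SM4 S-box on 16 bytes via AESE with a zero round key; mirrors the
     * tbl/ushr/and/tbl/tbl/eor/aese/... group in the assembly.  The
     * initial shuffle scatters the input bytes so that the ShiftRows
     * step performed by AESE brings them back into place. */
    static uint8x16_t sm4_sbox16(uint8x16_t in)
    {
        uint8x16_t x  = vqtbl1q_u8(in, shuffle);
        uint8x16_t hi = vshrq_n_u8(x, 4);
        uint8x16_t lo = vandq_u8(x, nib_mask);
        x  = veorq_u8(vqtbl1q_u8(pre_lo, lo), vqtbl1q_u8(pre_hi, hi));
        x  = vaeseq_u8(x, vdupq_n_u8(0));      /* ShiftRows + SubBytes */
        hi = vshrq_n_u8(x, 4);
        lo = vandq_u8(x, nib_mask);
        return veorq_u8(vqtbl1q_u8(post_lo, lo), vqtbl1q_u8(post_hi, hi));
    }

Such a sketch would need to be built with the AES crypto extension enabled (for example -march=armv8-a+crypto).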
+.globl vpsm4_ex_ctr32_encrypt_blocks
+.type vpsm4_ex_ctr32_encrypt_blocks,%function
+.align 5
+vpsm4_ex_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.4s},[x4]
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+ cmp w2,#1
+ b.ne 1f
+	// fast path for a single block, avoiding the
+	// context-saving overhead
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ ret
+1:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w5,v3.s[3]
+.Lctr32_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ add w5,w5,#1
+ mov v7.s[2],w5
+ add w5,w5,#1
+ mov v7.s[3],w5
+ add w5,w5,#1
+ cmp w2,#8
+ b.ge .Lctr32_8_blocks_process
+ bl _vpsm4_ex_enc_4blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+.Lctr32_8_blocks_process:
+ dup v8.4s,w12
+ dup v9.4s,w13
+ dup v10.4s,w14
+ mov v11.s[0],w5
+ add w5,w5,#1
+ mov v11.s[1],w5
+ add w5,w5,#1
+ mov v11.s[2],w5
+ add w5,w5,#1
+ mov v11.s[3],w5
+ add w5,w5,#1
+ bl _vpsm4_ex_enc_8blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ eor v4.16b,v4.16b,v8.16b
+ eor v5.16b,v5.16b,v9.16b
+ eor v6.16b,v6.16b,v10.16b
+ eor v7.16b,v7.16b,v11.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+1: // last block processing
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
+ mov v3.s[0],w12
+ mov v3.s[1],w13
+ mov v3.s[2],w14
+ mov v3.s[3],w5
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ b 100f
+1: // last 2 blocks processing
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ subs w2,w2,#1
+ b.ne 1f
+ bl _vpsm4_ex_enc_4blks
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ b 100f
+1: // last 3 blocks processing
+ add w5,w5,#1
+ mov v7.s[2],w5
+ bl _vpsm4_ex_enc_4blks
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks
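As the dup/mov lane fills above show, vpsm4_ex_ctr32_encrypt_blocks treats only the last big-endian 32-bit word of the 16-byte counter block as the counter: the first three words are broadcast unchanged and the fourth is incremented once per block, with no carry into the upper 96 bits, and the counter is not written back. A plain C sketch of the same keystream construction, assuming a hypothetical sm4_encrypt_block() single-block primitive and a key schedule of 32 round keys:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block SM4 encryption (32 round keys). */
    extern void sm4_encrypt_block(const uint32_t rk[32],
                                  const uint8_t in[16], uint8_t out[16]);

    /* CTR32: only ivec[12..15] acts as a (big-endian) counter. */
    static void sm4_ctr32(const uint32_t rk[32], const uint8_t ivec[16],
                          const uint8_t *in, uint8_t *out, size_t blocks)
    {
        uint32_t ctr = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
                       ((uint32_t)ivec[14] << 8)  |  (uint32_t)ivec[15];

        for (size_t i = 0; i < blocks; i++, in += 16, out += 16) {
            uint8_t block[16], ks[16];

            memcpy(block, ivec, 12);
            block[12] = (uint8_t)(ctr >> 24);
            block[13] = (uint8_t)(ctr >> 16);
            block[14] = (uint8_t)(ctr >> 8);
            block[15] = (uint8_t)ctr;
            sm4_encrypt_block(rk, block, ks);
            for (int j = 0; j < 16; j++)
                out[j] = in[j] ^ ks[j];
            ctr++;              /* wraps modulo 2^32 by design */
        }
    }

The assembly obtains the same effect in batches of four or eight blocks by keeping the counter blocks in word-sliced registers for _vpsm4_ex_enc_4blks/_vpsm4_ex_enc_8blks and interleaving the result back with st4.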
+.globl vpsm4_ex_xts_encrypt_gb
+.type vpsm4_ex_xts_encrypt_gb,%function
+.align 5
+vpsm4_ex_xts_encrypt_gb:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v16.4s}, [x5]
+ mov x3,x27
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v16.s[0]
+ mov w13,v16.s[1]
+ mov w14,v16.s[2]
+ mov w15,v16.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v16.s[0],w15
+ mov v16.s[1],w14
+ mov v16.s[2],w13
+ mov v16.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
+ cmp x2,#1
+ b.lt .return_gb
+
+ cmp x29,0
+	// If the encryption/decryption length is a multiple of 16,
+	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ b.eq .xts_encrypt_blocks_gb
+
+	// If the encryption/decryption length is not a multiple of 16,
+	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb,
+	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak_gb
+.xts_encrypt_blocks_gb:
+ rbit v16.16b,v16.16b
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x12,v16.d[0]
+ mov x13,v16.d[1]
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process_gb:
+ cmp x2,#8
+ mov v16.d[0],x12
+ mov v16.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v17.d[0],x14
+ mov v17.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v18.d[0],x16
+ mov v18.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v19.d[0],x18
+ mov v19.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v20.d[0],x20
+ mov v20.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v21.d[0],x22
+ mov v21.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v22.d[0],x24
+ mov v22.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v23.d[0],x26
+ mov v23.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ b.lt .Lxts_4_blocks_process_gb
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ rbit v18.16b,v18.16b
+ rbit v19.16b,v19.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ rbit v20.16b,v20.16b
+ rbit v21.16b,v21.16b
+ rbit v22.16b,v22.16b
+ rbit v23.16b,v23.16b
+ eor v8.16b, v8.16b, v20.16b
+ eor v9.16b, v9.16b, v21.16b
+ eor v10.16b, v10.16b, v22.16b
+ eor v11.16b, v11.16b, v23.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ // save the last tweak
+ mov v25.16b,v23.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process_gb
+ b 100f
+.Lxts_4_blocks_process_gb:
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ rbit v18.16b,v18.16b
+ rbit v19.16b,v19.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v16.16b,v20.16b
+ mov v17.16b,v21.16b
+ mov v18.16b,v22.16b
+ // save the last tweak
+ mov v25.16b,v19.16b
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ rbit v16.16b,v16.16b
+ eor v4.16b, v4.16b, v16.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v16.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ mov v25.16b,v16.16b
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ mov v25.16b,v17.16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ rbit v18.16b,v18.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ mov v25.16b,v18.16b
+100:
+ cmp x29,0
+ b.eq .return_gb
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is larger than 32
+.last_2blks_tweak_gb:
+#ifdef __AARCH64EB__
+ rev32 v25.16b,v25.16b
+#endif
+ rbit v2.16b,v25.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v17.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ rbit v17.16b,v17.16b
+ rbit v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ rbit v18.16b,v18.16b
+ b .check_dec_gb
+
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is exactly 32, which needs only two tweaks
+.only_2blks_tweak_gb:
+ mov v17.16b,v16.16b
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ rbit v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ rbit v18.16b,v18.16b
+ b .check_dec_gb
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec_gb:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks_gb
+ mov v0.16B,v17.16b
+ mov v17.16B,v18.16b
+ mov v18.16B,v0.16b
+
+.process_last_2blks_gb:
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v17.16b
+ st1 {v4.4s},[x1],#16
+
+ sub x26,x1,16
+.loop_gb:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop_gb
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v18.16b
+ st1 {v4.4s}, [x26]
+.return_gb:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb
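The mov/extr/and/eor groups that thread x12..x27 through the XTS code above compute consecutive tweaks: each step multiplies the 128-bit tweak by x in GF(2^128) with the reduction constant 0x87, i.e. a 1-bit left shift of the 128-bit value with 0x87 folded into the low byte when the shifted-out bit was set. The _gb entry point additionally rbit-reverses the tweaks, since the GB/T flavour of XTS tracks the tweak in reversed bit order, and the byte-swapping loop before .return_gb performs the ciphertext stealing for a trailing partial block. A small C sketch of the scalar tweak update, with the tweak held as two little-endian 64-bit halves:

    #include <stdint.h>

    /* Multiply an XTS tweak by x in GF(2^128), reduction polynomial
     * x^128 + x^7 + x^2 + x + 1 (0x87); t[0] is the low half, t[1]
     * the high half, matching x12/x13 in the assembly. */
    static void xts_next_tweak(uint64_t t[2])
    {
        uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;

        t[1] = (t[1] << 1) | (t[0] >> 63);
        t[0] = (t[0] << 1) ^ carry;
    }

The vector variant in .last_2blks_tweak_gb does the same update with shl/ext/ushr/mul against the .Lxts_magic constant instead of scalar shifts.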
+.globl vpsm4_ex_xts_encrypt
+.type vpsm4_ex_xts_encrypt,%function
+.align 5
+vpsm4_ex_xts_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v16.4s}, [x5]
+ mov x3,x27
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v16.s[0]
+ mov w13,v16.s[1]
+ mov w14,v16.s[2]
+ mov w15,v16.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v16.s[0],w15
+ mov v16.s[1],w14
+ mov v16.s[2],w13
+ mov v16.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
+ cmp x2,#1
+ b.lt .return
+
+ cmp x29,0
+	// If the encryption/decryption length is a multiple of 16,
+	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
+ b.eq .xts_encrypt_blocks
+
+	// If the encryption/decryption length is not a multiple of 16,
+	// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak,
+	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak
+.xts_encrypt_blocks:
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x12,v16.d[0]
+ mov x13,v16.d[1]
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process:
+ cmp x2,#8
+ mov v16.d[0],x12
+ mov v16.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v17.d[0],x14
+ mov v17.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v18.d[0],x16
+ mov v18.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v19.d[0],x18
+ mov v19.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v20.d[0],x20
+ mov v20.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v21.d[0],x22
+ mov v21.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v22.d[0],x24
+ mov v22.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v23.d[0],x26
+ mov v23.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ b.lt .Lxts_4_blocks_process
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v8.16b, v8.16b, v20.16b
+ eor v9.16b, v9.16b, v21.16b
+ eor v10.16b, v10.16b, v22.16b
+ eor v11.16b, v11.16b, v23.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ // save the last tweak
+ mov v25.16b,v23.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process
+ b 100f
+.Lxts_4_blocks_process:
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v16.16b,v20.16b
+ mov v17.16b,v21.16b
+ mov v18.16b,v22.16b
+ // save the last tweak
+ mov v25.16b,v19.16b
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v16.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
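+ // 32 SM4 rounds follow: eight iterations of four rounds. Each round
+ // evaluates the SM4 S-box through the AES instruction (nibble tbl
+ // lookups apply affine pre/post transforms around aese with a zero
+ // round key) and then applies the linear transform
+ // L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24).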
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v16.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ mov v25.16b,v16.16b
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ mov v25.16b,v17.16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ mov v25.16b,v18.16b
+100:
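+ // x29 carries the number of trailing bytes past the last full block;
+ // a non-zero value means ciphertext stealing is required below.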
+ cmp x29,0
+ b.eq .return
+
+// This branch calculates the last two tweaks,
+// used when the encryption/decryption length is larger than 32 bytes
+.last_2blks_tweak:
+#ifdef __AARCH64EB__
+ rev32 v25.16b,v25.16b
+#endif
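+ // Each step below multiplies the tweak by x in GF(2^128): shl #1 shifts
+ // every byte left, ext/ushr gather each byte's carried-out top bit
+ // (byte 15's carry wraps around to byte 0), and mul with the .Lxts_magic
+ // constant applies the 0x87 reduction to the wrapped carry (the in-lane
+ // carries pass through unchanged) before it is XORed back in.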
+ mov v2.16b,v25.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v17.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ mov v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ b .check_dec
+
+
+// This branch calculates the last two tweaks,
+// used when the encryption/decryption length is exactly 32 bytes, which only needs two tweaks
+.only_2blks_tweak:
+ mov v17.16b,v16.16b
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ mov v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ b .check_dec
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks
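+ // Decryption only: swap the last two tweaks. With ciphertext stealing
+ // the last full block is processed with the final tweak and the stolen
+ // partial block with the preceding one.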
+ mov v0.16B,v17.16b
+ mov v17.16B,v18.16b
+ mov v18.16B,v0.16b
+
+.process_last_2blks:
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
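+ // Process the last full block with tweak v17 using the same 32-round
+ // scalar SM4 loop as above.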
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v17.16b
+ st1 {v4.4s},[x1],#16
+
+ sub x26,x1,16
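+ // Ciphertext stealing: x26 points at the block just written. The loop
+ // moves its first x29 bytes out to the final partial output block and
+ // replaces them with the remaining x29 input bytes; the patched block is
+ // then processed with the last tweak (v18) and stored back at x26.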
+.loop:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v18.16b
+ st1 {v4.4s}, [x26]
+.return:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt