Diffstat (limited to 'sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S')
-rw-r--r--  sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S  4523
1 file changed, 4523 insertions, 0 deletions
diff --git a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
new file mode 100644
index 000000000000..5627d6d1c6b4
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
@@ -0,0 +1,4523 @@
+/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
+// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+//
+// This module implements SM4 with ASIMD and AESE on AARCH64
+//
+// Dec 2022
+//
+
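+//
+// Exported entry points (argument registers per AAPCS64, as read off the
+// code below; the authoritative C prototypes live in OpenSSL's C sources):
+//   vpsm4_ex_set_encrypt_key / _set_decrypt_key: x0 user key, x1 round keys
+//   vpsm4_ex_encrypt / _decrypt:   x0 in block, x1 out block, x2 round keys
+//   vpsm4_ex_ecb_encrypt:          x0 in, x1 out, x2 length in bytes, x3 round keys
+//   vpsm4_ex_cbc_encrypt:          x0 in, x1 out, x2 length in bytes, x3 round keys,
+//                                  x4 IV, w5 enc flag (0 selects decryption)
+//   vpsm4_ex_ctr32_encrypt_blocks: x0 in, x1 out, x2 block count, x3 round keys,
+//                                  x4 counter block
+//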
+// $output is the last argument if it looks like a file (it has an extension)
+// $flavour is the first argument if it doesn't look like a file
+#include "arm_arch.h"
+.arch armv8-a+crypto
+.text
+
+.type _vpsm4_ex_consts,%object
+.align 7
+_vpsm4_ex_consts:
+.Lck:
+.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
+.Lshuffles:
+.quad 0x0B0A090807060504,0x030201000F0E0D0C
+.Lxts_magic:
+.quad 0x0101010101010187,0x0101010101010101
+.Lsbox_magic:
+.quad 0x0b0e0104070a0d00,0x0306090c0f020508
+.quad 0x62185a2042387a00,0x22581a6002783a40
+.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
+.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
+.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
+.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
+
+.size _vpsm4_ex_consts,.-_vpsm4_ex_consts
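+// Constant layout (roles inferred from how the code below uses them):
+//   .Lck         the 32 SM4 key-schedule constants CK[0..31]
+//   .Lfk         the SM4 system parameters FK[0..3]
+//   .Lshuffles   tbl pattern rotating the 128-bit key state by one 32-bit word
+//   .Lxts_magic  GF(2^128) reduction constant (0x87) for XTS tweak doubling
+//   .Lsbox_magic six vectors driving the AESE-based S-box (loaded into
+//                v26..v31): an inverse-ShiftRows byte permutation, two pairs
+//                of per-nibble affine-transform tables (into and out of the
+//                AES S-box domain) and the 0x0f nibble mask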
+.type _vpsm4_ex_set_key,%function
+.align 4
+_vpsm4_ex_set_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v5.4s},[x0]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ adrp x5,.Lshuffles
+ add x5,x5,#:lo12:.Lshuffles
+ ld1 {v7.2d},[x5]
+ adrp x5,.Lfk
+ add x5,x5,#:lo12:.Lfk
+ ld1 {v6.2d},[x5]
+ eor v5.16b,v5.16b,v6.16b
+ mov x6,#32
+ adrp x5,.Lck
+ add x5,x5,#:lo12:.Lck
+ movi v0.16b,#64
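+	// w2 selects key order: non-zero stores the 32 round keys forward
+	// (encrypt schedule); zero starts at the last slot (offset 31*4 = 124)
+	// and stores them in reverse (decrypt schedule)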
+ cbnz w2,1f
+ add x1,x1,124
+1:
+ mov w7,v5.s[1]
+ ldr w8,[x5],#4
+ eor w8,w8,w7
+ mov w7,v5.s[2]
+ eor w8,w8,w7
+ mov w7,v5.s[3]
+ eor w8,w8,w7
+ // optimize sbox using AESE instruction
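+	// The pattern below (repeated throughout this file) computes the SM4
+	// S-box with AESE instead of a table lookup:
+	//   tbl  v26      pre-permute with inverse ShiftRows so the ShiftRows
+	//                 step inside AESE cancels out
+	//   tbl  v28/v27  per-nibble lookups (v31 = 0x0f mask) applying the
+	//                 affine map from the SM4 S-box domain into the AES one
+	//   aese          zero round key: AddRoundKey(0) + ShiftRows + SubBytes
+	//   tbl  v30/v29  per-nibble lookups applying the affine map back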
+ mov v4.s[0],w8
+ tbl v0.16b, {v4.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ mov w7,v0.s[0]
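+	// key-schedule linear transform L'(t) = t ^ (t <<< 13) ^ (t <<< 23):
+	// ror #19 is a rotate-left by 13, ror #9 a rotate-left by 23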
+ eor w8,w7,w7,ror #19
+ eor w8,w8,w7,ror #9
+ mov w7,v5.s[0]
+ eor w8,w8,w7
+ mov v5.s[0],w8
+ cbz w2,2f
+ str w8,[x1],#4
+ b 3f
+2:
+ str w8,[x1],#-4
+3:
+ tbl v5.16b,{v5.16b},v7.16b
+ subs x6,x6,#1
+ b.ne 1b
+ ret
+.size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key
+.type _vpsm4_ex_enc_4blks,%function
+.align 4
+_vpsm4_ex_enc_4blks:
+ AARCH64_VALID_CALL_TARGET
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor v14.16b,v6.16b,v7.16b
+ eor v12.16b,v5.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v12.16b,v0.16b
+
+ // linear transformation
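+	// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24);
+	// each 32-bit rotate-left by n is built as ushr #(32-n) followed by
+	// sli #n into the same destination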
+ ushr v0.4s,v12.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ eor v4.16b,v4.16b,v12.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor v14.16b,v14.16b,v4.16b
+ eor v13.16b,v14.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v13.16b,v0.16b
+
+ // linear transformation
+ ushr v0.4s,v13.4s,32-2
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v0.4s,v13.4s,2
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v0.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v13.16b
+
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor v14.16b,v4.16b,v5.16b
+ eor v12.16b,v7.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v12.16b,v0.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ eor v6.16b,v6.16b,v12.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor v14.16b,v14.16b,v6.16b
+ eor v13.16b,v14.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ mov v13.16b,v0.16b
+
+ // linear transformation
+ ushr v0.4s,v13.4s,32-2
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v0.4s,v13.4s,2
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v0.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v7.16b,v7.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+ ret
+.size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks
+.type _vpsm4_ex_enc_8blks,%function
+.align 4
+_vpsm4_ex_enc_8blks:
+ AARCH64_VALID_CALL_TARGET
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ dup v12.4s,w7
+ eor v14.16b,v6.16b,v7.16b
+ eor v15.16b,v10.16b,v11.16b
+ eor v0.16b,v5.16b,v12.16b
+ eor v1.16b,v9.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v4.16b,v4.16b,v12.16b
+ eor v8.16b,v8.16b,v13.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v4.16b
+ eor v15.16b,v15.16b,v8.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v12.16b
+ eor v9.16b,v9.16b,v13.16b
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ dup v12.4s,w7
+ eor v14.16b,v4.16b,v5.16b
+ eor v15.16b,v8.16b,v9.16b
+ eor v0.16b,v7.16b,v12.16b
+ eor v1.16b,v11.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v6.16b,v6.16b,v12.16b
+ eor v10.16b,v10.16b,v13.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v10.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v12.16b}, v26.16b
+ tbl v1.16b, {v13.16b}, v26.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v28.16b}, v1.16b
+ tbl v24.16b, {v27.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ eor v25.16b, v25.16b, v25.16b
+ aese v0.16b,v25.16b
+ aese v1.16b,v25.16b
+ ushr v24.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v0.16b, v0.16b, v24.16b
+ ushr v24.16b, v1.16b, 4
+ and v1.16b, v1.16b, v31.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ tbl v24.16b, {v29.16b}, v24.16b
+ eor v1.16b, v1.16b, v24.16b
+ mov v12.16b,v0.16b
+ mov v13.16b,v1.16b
+
+ // linear transformation
+ ushr v0.4s,v12.4s,32-2
+ ushr v25.4s,v13.4s,32-2
+ ushr v1.4s,v12.4s,32-10
+ ushr v2.4s,v12.4s,32-18
+ ushr v3.4s,v12.4s,32-24
+ sli v0.4s,v12.4s,2
+ sli v25.4s,v13.4s,2
+ sli v1.4s,v12.4s,10
+ sli v2.4s,v12.4s,18
+ sli v3.4s,v12.4s,24
+ eor v24.16b,v0.16b,v12.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v12.16b,v2.16b,v3.16b
+ eor v12.16b,v12.16b,v24.16b
+ ushr v1.4s,v13.4s,32-10
+ ushr v2.4s,v13.4s,32-18
+ ushr v3.4s,v13.4s,32-24
+ sli v1.4s,v13.4s,10
+ sli v2.4s,v13.4s,18
+ sli v3.4s,v13.4s,24
+ eor v24.16b,v25.16b,v13.16b
+ eor v24.16b,v24.16b,v1.16b
+ eor v13.16b,v2.16b,v3.16b
+ eor v13.16b,v13.16b,v24.16b
+ eor v7.16b,v7.16b,v12.16b
+ eor v11.16b,v11.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v8.16b
+#else
+ mov v7.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v9.16b
+#else
+ mov v6.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v10.16b
+#else
+ mov v5.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v11.16b
+#else
+ mov v4.16b,v11.16b
+#endif
+ ret
+.size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks
+.globl vpsm4_ex_set_encrypt_key
+.type vpsm4_ex_set_encrypt_key,%function
+.align 5
+vpsm4_ex_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,1
+ bl _vpsm4_ex_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key
+.globl vpsm4_ex_set_decrypt_key
+.type vpsm4_ex_set_decrypt_key,%function
+.align 5
+vpsm4_ex_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,0
+ bl _vpsm4_ex_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key
+.globl vpsm4_ex_encrypt
+.type vpsm4_ex_encrypt,%function
+.align 5
+vpsm4_ex_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
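+	// scalar round linear transform L(t) = t ^ (t <<< 2) ^ (t <<< 10) ^
+	// (t <<< 18) ^ (t <<< 24); ror #(32-n) is a rotate-left by n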
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt
+.globl vpsm4_ex_decrypt
+.type vpsm4_ex_decrypt,%function
+.align 5
+vpsm4_ex_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt
+.globl vpsm4_ex_ecb_encrypt
+.type vpsm4_ex_ecb_encrypt,%function
+.align 5
+vpsm4_ex_ecb_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ // convert length into blocks
+ lsr x2,x2,4
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+.Lecb_8_blocks_process:
+ cmp w2,#8
+ b.lt .Lecb_4_blocks_process
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_ex_enc_8blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lecb_8_blocks_process
+ b 100f
+.Lecb_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub w2,w2,#4
+1:
+ // process last block
+ cmp w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ b 100f
+1: // process last 2 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
+ cmp w2,#2
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
+ b 100f
+1: // process last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt
+.globl vpsm4_ex_cbc_encrypt
+.type vpsm4_ex_cbc_encrypt,%function
+.align 5
+vpsm4_ex_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ lsr x2,x2,4
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+ cbz w5,.Ldec
+ ld1 {v3.4s},[x4]
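+	// CBC encryption is serial (each block is chained through the previous
+	// ciphertext), so the 4-block path below runs four dependent
+	// single-block rounds, chaining through v4..v7 and back into v3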
+.Lcbc_4_blocks_enc:
+ cmp w2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b,v4.16b,v3.16b
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+ eor v5.16b,v5.16b,v4.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v5.s[0]
+ mov w13,v5.s[1]
+ mov w14,v5.s[2]
+ mov w15,v5.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v5.s[0],w15
+ mov v5.s[1],w14
+ mov v5.s[2],w13
+ mov v5.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v6.16b,v6.16b,v5.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v6.s[0]
+ mov w13,v6.s[1]
+ mov w14,v6.s[2]
+ mov w15,v6.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v6.s[0],w15
+ mov v6.s[1],w14
+ mov v6.s[2],w13
+ mov v6.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ eor v7.16b,v7.16b,v6.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v7.s[0]
+ mov w13,v7.s[1]
+ mov w14,v7.s[2]
+ mov w15,v7.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v7.s[0],w15
+ mov v7.s[1],w14
+ mov v7.s[2],w13
+ mov v7.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ orr v3.16b,v7.16b,v7.16b
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lcbc_4_blocks_enc
+ b 2f
+1:
+ subs w2,w2,#1
+ b.lt 2f
+ ld1 {v4.4s},[x0],#16
+ eor v3.16b,v3.16b,v4.16b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ st1 {v3.4s},[x1],#16
+ b 1b
+2:
+ // save back IV
+ st1 {v3.4s},[x4]
+ ret
+
+.Ldec:
+ // decryption mode starts
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+.Lcbc_8_blocks_dec:
+ cmp w2,#8
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+ add x10,x0,#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_ex_enc_8blks
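+	// _vpsm4_ex_enc_8blks returns the blocks word-sliced (ld4 layout:
+	// word i of every block in one register); the zip1/zip2 sequence
+	// transposes back to whole blocks so the ciphertext can be XORed in
+	// and stored contiguously with st1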
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ ld1 {v15.4s},[x4]
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ // note ivec1 and vtmpx[3] are reusing the same register
+ // care needs to be taken to avoid conflict
+ eor v0.16b,v0.16b,v15.16b
+ ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v1.16b,v1.16b,v8.16b
+ eor v2.16b,v2.16b,v9.16b
+ eor v3.16b,v3.16b,v10.16b
+ // save back IV
+ st1 {v15.4s}, [x4]
+ eor v4.16b,v4.16b,v11.16b
+ eor v5.16b,v5.16b,v12.16b
+ eor v6.16b,v6.16b,v13.16b
+ eor v7.16b,v7.16b,v14.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+1:
+ ld1 {v15.4s},[x4]
+.Lcbc_4_blocks_dec:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ orr v15.16b,v7.16b,v7.16b
+ eor v2.16b,v2.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+ st1 {v7.4s}, [x4]
+ b 100f
+1: // last block
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ // save back IV
+ st1 {v4.4s}, [x4]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v4.16b
+#else
+ mov v8.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ eor v8.16b,v8.16b,v15.16b
+ st1 {v8.4s},[x1],#16
+ b 100f
+1: // last two blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
+ add x10,x0,#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
+ subs w2,w2,1
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ ld1 {v4.4s,v5.4s},[x0],#32
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save back IV
+ st1 {v5.4s}, [x4]
+ b 100f
+1: // last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_ex_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ eor v2.16b,v2.16b,v5.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save back IV
+ st1 {v6.4s}, [x4]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt
+.globl vpsm4_ex_ctr32_encrypt_blocks
+.type vpsm4_ex_ctr32_encrypt_blocks,%function
+.align 5
+vpsm4_ex_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.4s},[x4]
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+ cmp w2,#1
+ b.ne 1f
+ // fast processing for one single block without
+ // context saving overhead
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ ret
+1:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w5,v3.s[3]
+.Lctr32_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
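+	// build four consecutive counter blocks in word-sliced (ld4) layout:
+	// v4-v6 broadcast the three fixed IV words, v7 takes four successive
+	// values of the 32-bit counter kept in w5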
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ add w5,w5,#1
+ mov v7.s[2],w5
+ add w5,w5,#1
+ mov v7.s[3],w5
+ add w5,w5,#1
+ cmp w2,#8
+ b.ge .Lctr32_8_blocks_process
+ bl _vpsm4_ex_enc_4blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+.Lctr32_8_blocks_process:
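+	// Eight-block path: v4-v7 already hold the first four counter blocks
+	// from above; v8-v11 are filled with the next four before calling the
+	// eight-block encryption routine.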
+ dup v8.4s,w12
+ dup v9.4s,w13
+ dup v10.4s,w14
+ mov v11.s[0],w5
+ add w5,w5,#1
+ mov v11.s[1],w5
+ add w5,w5,#1
+ mov v11.s[2],w5
+ add w5,w5,#1
+ mov v11.s[3],w5
+ add w5,w5,#1
+ bl _vpsm4_ex_enc_8blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ eor v4.16b,v4.16b,v8.16b
+ eor v5.16b,v5.16b,v9.16b
+ eor v6.16b,v6.16b,v10.16b
+ eor v7.16b,v7.16b,v11.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+1: // last block processing
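+	// Fewer than four blocks remain: exactly one block is handled by the
+	// scalar round loop below, while two or three remaining blocks branch
+	// to the partial-lane four-block paths that follow.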
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
+ mov v3.s[0],w12
+ mov v3.s[1],w13
+ mov v3.s[2],w14
+ mov v3.s[3],w5
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ b 100f
+1: // last 2 blocks processing
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ subs w2,w2,#1
+ b.ne 1f
+ bl _vpsm4_ex_enc_4blks
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ b 100f
+1: // last 3 blocks processing
+ add w5,w5,#1
+ mov v7.s[2],w5
+ bl _vpsm4_ex_enc_4blks
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks
+.globl vpsm4_ex_xts_encrypt_gb
+.type vpsm4_ex_xts_encrypt_gb,%function
+.align 5
+vpsm4_ex_xts_encrypt_gb:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v16.4s}, [x5]
+ mov x3,x27
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
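+	// Encrypt the input IV with the round keys passed in x4 (saved in x27,
+	// the tweak key schedule) to produce the initial XTS tweak in v16; x26
+	// preserves the data key schedule for the block encryptions that follow.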
+ mov x10,x3
+ mov w11,#8
+ mov w12,v16.s[0]
+ mov w13,v16.s[1]
+ mov w14,v16.s[2]
+ mov w15,v16.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v16.s[0],w15
+ mov v16.s[1],w14
+ mov v16.s[2],w13
+ mov v16.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
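+	// x29 now holds the trailing byte count (len mod 16) and x2 the number
+	// of complete 16-byte blocks.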
+ cmp x2,#1
+ b.lt .return_gb
+
+ cmp x29,0
+	// If the encryption/decryption length is a multiple of 16,
+	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ b.eq .xts_encrypt_blocks_gb
+
+	// If the encryption/decryption length is not a multiple of 16,
+	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb,
+	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak_gb
+.xts_encrypt_blocks_gb:
+ rbit v16.16b,v16.16b
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x12,v16.d[0]
+ mov x13,v16.d[1]
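+	// Derive the next seven tweaks in scalar registers: each step multiplies
+	// the 128-bit tweak by x in GF(2^128), i.e. shifts it left by one bit
+	// and folds the carry back through the XTS constant 0x87. The rbit
+	// above reverses the bit order within each byte, which appears to be
+	// what distinguishes this GB tweak convention from the plain XTS order
+	// used by vpsm4_ex_xts_encrypt below.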
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process_gb:
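+	// Main eight-block loop: the current eight tweaks are moved from the
+	// scalar registers into v16-v23 while the next eight tweaks are computed
+	// in the same scalar registers, interleaved with the vector moves.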
+ cmp x2,#8
+ mov v16.d[0],x12
+ mov v16.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v17.d[0],x14
+ mov v17.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v18.d[0],x16
+ mov v18.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v19.d[0],x18
+ mov v19.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v20.d[0],x20
+ mov v20.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v21.d[0],x22
+ mov v21.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v22.d[0],x24
+ mov v22.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v23.d[0],x26
+ mov v23.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ b.lt .Lxts_4_blocks_process_gb
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ rbit v18.16b,v18.16b
+ rbit v19.16b,v19.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ rbit v20.16b,v20.16b
+ rbit v21.16b,v21.16b
+ rbit v22.16b,v22.16b
+ rbit v23.16b,v23.16b
+ eor v8.16b, v8.16b, v20.16b
+ eor v9.16b, v9.16b, v21.16b
+ eor v10.16b, v10.16b, v22.16b
+ eor v11.16b, v11.16b, v23.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
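+	// Transpose the eight blocks from block order into word-sliced order
+	// (all B0 words together, then B1, B2, B3) as expected by
+	// _vpsm4_ex_enc_8blks; the zip sequence after the call restores block
+	// order before the tweaks are XORed back in.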
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ // save the last tweak
+ mov v25.16b,v23.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process_gb
+ b 100f
+.Lxts_4_blocks_process_gb:
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ rbit v18.16b,v18.16b
+ rbit v19.16b,v19.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v16.16b,v20.16b
+ mov v17.16b,v21.16b
+ mov v18.16b,v22.16b
+ // save the last tweak
+ mov v25.16b,v19.16b
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ rbit v16.16b,v16.16b
+ eor v4.16b, v4.16b, v16.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v16.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ mov v25.16b,v16.16b
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ mov v25.16b,v17.16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ rbit v16.16b,v16.16b
+ rbit v17.16b,v17.16b
+ rbit v18.16b,v18.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ mov v25.16b,v18.16b
+100:
+ cmp x29,0
+ b.eq .return_gb
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is larger than 32 bytes
+.last_2blks_tweak_gb:
+#ifdef __AARCH64EB__
+ rev32 v25.16b,v25.16b
+#endif
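+	// Vectorised tweak doubling: shl shifts every byte lane left by one bit,
+	// the ext/ushr pair hands each lane the top bit of the preceding lane
+	// (lane 0 receives the bit shifted out of lane 15), and multiplying the
+	// carry bytes by .Lxts_magic (0x87 in lane 0, 0x01 elsewhere) folds the
+	// wrapped-around carry back in with the GF(2^128) reduction.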
+ rbit v2.16b,v25.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v17.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ rbit v17.16b,v17.16b
+ rbit v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ rbit v18.16b,v18.16b
+ b .check_dec_gb
+
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is exactly 32 bytes, which needs only two tweaks
+.only_2blks_tweak_gb:
+ mov v17.16b,v16.16b
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ rbit v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ rbit v18.16b,v18.16b
+ b .check_dec_gb
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec_gb:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks_gb
+ mov v0.16B,v17.16b
+ mov v17.16B,v18.16b
+ mov v18.16B,v0.16b
+
+.process_last_2blks_gb:
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v17.16b
+ st1 {v4.4s},[x1],#16
+
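+	// Ciphertext stealing: swap the trailing x29 plaintext bytes with the
+	// leading bytes of the ciphertext block just written (addressed via x26),
+	// emit the stolen ciphertext bytes as the short final output block, then
+	// re-encrypt the patched block at x26 under the remaining tweak in v18.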
+ sub x26,x1,16
+.loop_gb:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop_gb
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v18.16b
+ st1 {v4.4s}, [x26]
+.return_gb:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb
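+// vpsm4_ex_xts_encrypt mirrors vpsm4_ex_xts_encrypt_gb above, except that
+// the tweak multiplication is carried out in the usual XTS (IEEE P1619) bit
+// order, so the rbit adjustments of the GB variant are not needed.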
+.globl vpsm4_ex_xts_encrypt
+.type vpsm4_ex_xts_encrypt,%function
+.align 5
+vpsm4_ex_xts_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v16.4s}, [x5]
+ mov x3,x27
+ adrp x9, .Lsbox_magic
+ ldr q26, [x9, #:lo12:.Lsbox_magic]
+ ldr q27, [x9, #:lo12:.Lsbox_magic+16]
+ ldr q28, [x9, #:lo12:.Lsbox_magic+32]
+ ldr q29, [x9, #:lo12:.Lsbox_magic+48]
+ ldr q30, [x9, #:lo12:.Lsbox_magic+64]
+ ldr q31, [x9, #:lo12:.Lsbox_magic+80]
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v16.s[0]
+ mov w13,v16.s[1]
+ mov w14,v16.s[2]
+ mov w15,v16.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v16.s[0],w15
+ mov v16.s[1],w14
+ mov v16.s[2],w13
+ mov v16.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
+ cmp x2,#1
+ b.lt .return
+
+ cmp x29,0
+	// If the encryption/decryption length is a multiple of 16,
+	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
+ b.eq .xts_encrypt_blocks
+
+	// If the encryption/decryption length is not a multiple of 16,
+	// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak,
+	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak
+.xts_encrypt_blocks:
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov x12,v16.d[0]
+ mov x13,v16.d[1]
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process:
+ cmp x2,#8
+ mov v16.d[0],x12
+ mov v16.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v16.16b,v16.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v17.d[0],x14
+ mov v17.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v18.d[0],x16
+ mov v18.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v19.d[0],x18
+ mov v19.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v19.16b,v19.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v20.d[0],x20
+ mov v20.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v20.16b,v20.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v21.d[0],x22
+ mov v21.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v21.16b,v21.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v22.d[0],x24
+ mov v22.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v22.16b,v22.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v23.d[0],x26
+ mov v23.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v23.16b,v23.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ b.lt .Lxts_4_blocks_process
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v8.16b, v8.16b, v20.16b
+ eor v9.16b, v9.16b, v21.16b
+ eor v10.16b, v10.16b, v22.16b
+ eor v11.16b, v11.16b, v23.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ // save the last tweak
+ mov v25.16b,v23.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process
+ b 100f
+.Lxts_4_blocks_process:
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+ eor v7.16b, v7.16b, v19.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v16.16b,v20.16b
+ mov v17.16b,v21.16b
+ mov v18.16b,v22.16b
+ // save the last tweak
+ mov v25.16b,v19.16b
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v16.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v16.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ mov v25.16b,v16.16b
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ mov v25.16b,v17.16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ eor v4.16b, v4.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v6.16b, v6.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_ex_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ mov v25.16b,v18.16b
+100:
+ cmp x29,0
+ b.eq .return
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is larger than 32 bytes
+.last_2blks_tweak:
+#ifdef __AARCH64EB__
+ rev32 v25.16b,v25.16b
+#endif
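+	// Same vectorised tweak doubling as in .last_2blks_tweak_gb, but in the
+	// standard bit order, so no rbit of the tweak is required.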
+ mov v2.16b,v25.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v17.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ mov v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ b .check_dec
+
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is exactly 32 bytes, which needs only two tweaks
+.only_2blks_tweak:
+ mov v17.16b,v16.16b
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+ mov v2.16b,v17.16b
+ adrp x9, .Lxts_magic
+ ldr q0, [x9, #:lo12:.Lxts_magic]
+ shl v18.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v18.16b, v18.16b, v1.16b
+ b .check_dec
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks
+ mov v0.16B,v17.16b
+ mov v17.16B,v18.16b
+ mov v18.16B,v0.16b
+
+.process_last_2blks:
+#ifdef __AARCH64EB__
+ rev32 v17.16b,v17.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v18.16b,v18.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v17.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v17.16b
+ st1 {v4.4s},[x1],#16
+
+ sub x26,x1,16
+.loop:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v18.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ mov v3.s[0],w6
+ // optimize sbox using AESE instruction
+ tbl v0.16b, {v3.16b}, v26.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v28.16b}, v0.16b
+ tbl v2.16b, {v27.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+ eor v1.16b, v1.16b, v1.16b
+ aese v0.16b,v1.16b
+ ushr v2.16b, v0.16b, 4
+ and v0.16b, v0.16b, v31.16b
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v2.16b, {v29.16b}, v2.16b
+ eor v0.16b, v0.16b, v2.16b
+
+ mov w7,v0.s[0]
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v18.16b
+ st1 {v4.4s}, [x26]
+.return:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt