Diffstat (limited to 'sys/crypto/openssl/aarch64/vpsm4-armv8.S')
-rw-r--r--  sys/crypto/openssl/aarch64/vpsm4-armv8.S  5021
1 file changed, 5021 insertions, 0 deletions
diff --git a/sys/crypto/openssl/aarch64/vpsm4-armv8.S b/sys/crypto/openssl/aarch64/vpsm4-armv8.S
new file mode 100644
index 000000000000..830e0315a2be
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/vpsm4-armv8.S
@@ -0,0 +1,5021 @@
+/* Do not modify. This file is auto-generated from vpsm4-armv8.pl. */
+// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+//
+// This module implements SM4 with ASIMD on aarch64
+//
+// Feb 2022
+//
+
+// $output is the last argument if it looks like a file (it has an extension)
+// $flavour is the first argument if it doesn't look like a file
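+//
+// Quick reference for the generated code below (standard SM4): each of
+// the 32 rounds computes
+//     X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i])
+// where T = L(tau(x)), tau applies the 8-bit S-box to every byte, and
+//     cipher:       L(B)  = B ^ rol32(B,2) ^ rol32(B,10) ^ rol32(B,18) ^ rol32(B,24)
+//     key schedule: L'(B) = B ^ rol32(B,13) ^ rol32(B,23)
+// The rotations by 2/10/18/24 appear below as ushr/sli pairs in the
+// vector paths and as "ror #32-n" in the scalar paths; the rotations by
+// 13/23 appear as "ror #19"/"ror #9" in _vpsm4_set_key.  After 32 rounds
+// the four state words are written back in reverse order.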
+#include "arm_arch.h"
+.arch armv8-a
+.text
+
+.section .rodata
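+// Constant pool: .Lsbox is the 256-byte SM4 S-box; .Lck and .Lfk are the
+// CK round constants and FK constants from the SM4 specification;
+// .Lshuffles rotates the 128-bit key-schedule state by one 32-bit word
+// via tbl; .Lxts_magic is the GF(2^128) feedback constant (0x87) used by
+// the XTS code later in this file.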
+.type _vpsm4_consts,%object
+.align 7
+_vpsm4_consts:
+.Lsbox:
+.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
+.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
+.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
+.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
+.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
+.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
+.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
+.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
+.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
+.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
+.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
+.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
+.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
+.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
+.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
+.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
+.Lck:
+.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
+.Lshuffles:
+.quad 0x0B0A090807060504,0x030201000F0E0D0C
+.Lxts_magic:
+.quad 0x0101010101010187,0x0101010101010101
+
+.size _vpsm4_consts,.-_vpsm4_consts
+
+.previous
+
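+// _vpsm4_set_key: expand the 128-bit user key at x0 into 32 32-bit round
+// keys at x1.  Each iteration pushes one word through the S-box (four
+// 64-byte tbl/tbx lookups) and L'(x) = x ^ rol32(x,13) ^ rol32(x,23);
+// w2 != 0 stores the round keys in encryption order, w2 == 0 starts at
+// the end of the buffer and stores them reversed for decryption.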
+.type _vpsm4_set_key,%function
+.align 4
+_vpsm4_set_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v5.4s},[x0]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ adrp x5,.Lshuffles
+ add x5,x5,#:lo12:.Lshuffles
+ ld1 {v7.2d},[x5]
+ adrp x5,.Lfk
+ add x5,x5,#:lo12:.Lfk
+ ld1 {v6.2d},[x5]
+ eor v5.16b,v5.16b,v6.16b
+ mov x6,#32
+ adrp x5,.Lck
+ add x5,x5,#:lo12:.Lck
+ movi v0.16b,#64
+ cbnz w2,1f
+ add x1,x1,124
+1:
+ mov w7,v5.s[1]
+ ldr w8,[x5],#4
+ eor w8,w8,w7
+ mov w7,v5.s[2]
+ eor w8,w8,w7
+ mov w7,v5.s[3]
+ eor w8,w8,w7
+ // sbox lookup
+ mov v4.s[0],w8
+ tbl v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b
+ sub v4.16b,v4.16b,v0.16b
+ tbx v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b
+ sub v4.16b,v4.16b,v0.16b
+ tbx v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b
+ sub v4.16b,v4.16b,v0.16b
+ tbx v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b
+ mov w7,v1.s[0]
+ eor w8,w7,w7,ror #19
+ eor w8,w8,w7,ror #9
+ mov w7,v5.s[0]
+ eor w8,w8,w7
+ mov v5.s[0],w8
+ cbz w2,2f
+ str w8,[x1],#4
+ b 3f
+2:
+ str w8,[x1],#-4
+3:
+ tbl v5.16b,{v5.16b},v7.16b
+ subs x6,x6,#1
+ b.ne 1b
+ ret
+.size _vpsm4_set_key,.-_vpsm4_set_key
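+// _vpsm4_enc_4blks: run the 32 rounds (8 iterations of 4 unrolled rounds)
+// over four blocks held column-wise in v4-v7 (v4 = word 0 of all four
+// blocks, as produced by ld4 in the callers), with the round keys at x3.
+// The S-box is applied 16 bytes at a time: each of the four tbl lookups
+// into v16-v31 returns zero for out-of-range indices, so the partial
+// results are simply added together.  Results are returned in v0-v3 in
+// memory byte order, ready for st4.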
+.type _vpsm4_enc_4blks,%function
+.align 4
+_vpsm4_enc_4blks:
+ AARCH64_VALID_CALL_TARGET
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor v14.16b,v6.16b,v7.16b
+ eor v12.16b,v5.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v12.16b,v0.16b
+ sub v1.16b,v12.16b,v1.16b
+ sub v2.16b,v12.16b,v2.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v12.2d
+ add v12.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v12.4s,32-10
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v12.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v12.4s,32-18
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v12.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v12.4s,24
+ eor v12.16b,v2.16b,v1.16b
+ eor v4.16b,v4.16b,v12.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor v14.16b,v14.16b,v4.16b
+ eor v13.16b,v14.16b,v13.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v13.16b,v0.16b
+ sub v1.16b,v13.16b,v1.16b
+ sub v2.16b,v13.16b,v2.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v13.2d
+ add v13.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v13.4s,32-2
+ sli v0.4s,v13.4s,2
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v13.16b
+ sli v2.4s,v13.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v13.4s,32-18
+ sli v0.4s,v13.4s,18
+ ushr v2.4s,v13.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v1.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v13.16b
+
+ dup v12.4s,w7
+ dup v13.4s,w8
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor v14.16b,v4.16b,v5.16b
+ eor v12.16b,v7.16b,v12.16b
+ eor v12.16b,v14.16b,v12.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v12.16b,v0.16b
+ sub v1.16b,v12.16b,v1.16b
+ sub v2.16b,v12.16b,v2.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v12.2d
+ add v12.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v12.4s,32-10
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v12.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v12.4s,32-18
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v12.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v12.4s,24
+ eor v12.16b,v2.16b,v1.16b
+ eor v6.16b,v6.16b,v12.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor v14.16b,v14.16b,v6.16b
+ eor v13.16b,v14.16b,v13.16b
+ movi v0.16b,#64
+ movi v1.16b,#128
+ movi v2.16b,#192
+ sub v0.16b,v13.16b,v0.16b
+ sub v1.16b,v13.16b,v1.16b
+ sub v2.16b,v13.16b,v2.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v0.2d,v0.2d,v1.2d
+ add v2.2d,v2.2d,v13.2d
+ add v13.2d,v0.2d,v2.2d
+
+ ushr v0.4s,v13.4s,32-2
+ sli v0.4s,v13.4s,2
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v13.16b
+ sli v2.4s,v13.4s,10
+ eor v1.16b,v2.16b,v1.16b
+ ushr v0.4s,v13.4s,32-18
+ sli v0.4s,v13.4s,18
+ ushr v2.4s,v13.4s,32-24
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v1.16b
+ eor v7.16b,v7.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+ ret
+.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
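+// _vpsm4_enc_8blks: as _vpsm4_enc_4blks, but interleaves two sets of four
+// blocks (v4-v7 and v8-v11) through the rounds, which helps hide the tbl
+// and shift latency; results are returned in v0-v3 and v4-v7.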
+.type _vpsm4_enc_8blks,%function
+.align 4
+_vpsm4_enc_8blks:
+ AARCH64_VALID_CALL_TARGET
+ mov x10,x3
+ mov w11,#8
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ dup v12.4s,w7
+ eor v14.16b,v6.16b,v7.16b
+ eor v15.16b,v10.16b,v11.16b
+ eor v0.16b,v5.16b,v12.16b
+ eor v1.16b,v9.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ eor v4.16b,v4.16b,v12.16b
+ eor v8.16b,v8.16b,v13.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v4.16b
+ eor v15.16b,v15.16b,v8.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ ldp w7,w8,[x10],8
+ eor v5.16b,v5.16b,v12.16b
+ eor v9.16b,v9.16b,v13.16b
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ dup v12.4s,w7
+ eor v14.16b,v4.16b,v5.16b
+ eor v15.16b,v8.16b,v9.16b
+ eor v0.16b,v7.16b,v12.16b
+ eor v1.16b,v11.16b,v12.16b
+ eor v12.16b,v14.16b,v0.16b
+ eor v13.16b,v15.16b,v1.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ eor v6.16b,v6.16b,v12.16b
+ eor v10.16b,v10.16b,v13.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ dup v13.4s,w8
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v10.16b
+ eor v12.16b,v14.16b,v13.16b
+ eor v13.16b,v15.16b,v13.16b
+ movi v3.16b,#64
+ sub v0.16b,v12.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v12.2d,v2.2d,v12.2d
+ add v12.2d,v1.2d,v12.2d
+
+ sub v0.16b,v13.16b,v3.16b
+ sub v1.16b,v0.16b,v3.16b
+ sub v2.16b,v1.16b,v3.16b
+ tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
+ tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
+ tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
+ tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
+ add v1.2d,v0.2d,v1.2d
+ add v13.2d,v2.2d,v13.2d
+ add v13.2d,v1.2d,v13.2d
+
+ ushr v0.4s,v12.4s,32-2
+ sli v0.4s,v12.4s,2
+ ushr v2.4s,v13.4s,32-2
+ eor v1.16b,v0.16b,v12.16b
+ sli v2.4s,v13.4s,2
+
+ ushr v0.4s,v12.4s,32-10
+ eor v3.16b,v2.16b,v13.16b
+ sli v0.4s,v12.4s,10
+ ushr v2.4s,v13.4s,32-10
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,10
+
+ ushr v0.4s,v12.4s,32-18
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,18
+ ushr v2.4s,v13.4s,32-18
+ eor v1.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,18
+
+ ushr v0.4s,v12.4s,32-24
+ eor v3.16b,v2.16b,v3.16b
+ sli v0.4s,v12.4s,24
+ ushr v2.4s,v13.4s,32-24
+ eor v12.16b,v0.16b,v1.16b
+ sli v2.4s,v13.4s,24
+ eor v13.16b,v2.16b,v3.16b
+ eor v7.16b,v7.16b,v12.16b
+ eor v11.16b,v11.16b,v13.16b
+ subs w11,w11,#1
+ b.ne 10b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v4.16b
+#else
+ mov v3.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v2.16b,v5.16b
+#else
+ mov v2.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v1.16b,v6.16b
+#else
+ mov v1.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v0.16b,v7.16b
+#else
+ mov v0.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v8.16b
+#else
+ mov v7.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v9.16b
+#else
+ mov v6.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v10.16b
+#else
+ mov v5.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v11.16b
+#else
+ mov v4.16b,v11.16b
+#endif
+ ret
+.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
+.globl vpsm4_set_encrypt_key
+.type vpsm4_set_encrypt_key,%function
+.align 5
+vpsm4_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,1
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key
+.globl vpsm4_set_decrypt_key
+.type vpsm4_set_decrypt_key,%function
+.align 5
+vpsm4_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,0
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key
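+// vpsm4_encrypt/vpsm4_decrypt: single-block API (input block x0, output
+// block x1, key schedule x2).  The state lives in w12-w15 and the rounds
+// use lane 0 of the vector registers for the tbl-based S-box; encryption
+// and decryption differ only in the round-key order produced by the
+// set_key routines above.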
+.globl vpsm4_encrypt
+.type vpsm4_encrypt,%function
+.align 5
+vpsm4_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_encrypt,.-vpsm4_encrypt
+.globl vpsm4_decrypt
+.type vpsm4_decrypt,%function
+.align 5
+vpsm4_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v4.4s},[x0]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x3,x2
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ ret
+.size vpsm4_decrypt,.-vpsm4_decrypt
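+// vpsm4_ecb_encrypt: ECB driver (input x0, output x1, length in bytes x2,
+// key schedule x3).  The length is converted to blocks and processed in
+// batches of 8 and then 4 through the helpers above; a 1-block tail uses
+// the scalar round loop and 2-3 block tails are loaded lane by lane into
+// _vpsm4_enc_4blks.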
+.globl vpsm4_ecb_encrypt
+.type vpsm4_ecb_encrypt,%function
+.align 5
+vpsm4_ecb_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ // convert length into blocks
+ lsr x2,x2,4
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+.Lecb_8_blocks_process:
+ cmp w2,#8
+ b.lt .Lecb_4_blocks_process
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_enc_8blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lecb_8_blocks_process
+ b 100f
+.Lecb_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub w2,w2,#4
+1:
+ // process last block
+ cmp w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ st1 {v4.4s},[x1]
+ b 100f
+1: // process last 2 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
+ cmp w2,#2
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
+ b 100f
+1: // process last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt
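+// vpsm4_cbc_encrypt: CBC driver (input x0, output x1, length in bytes x2,
+// key schedule x3, IV x4, enc flag w5).  Encryption is inherently serial,
+// so blocks are chained through the scalar round loop (unrolled four
+// blocks per pass); decryption (w5 == 0, .Ldec) runs the parallel 8- and
+// 4-block helpers and XORs the previous ciphertext blocks in afterwards,
+// the reversed key schedule providing the inverse cipher.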
+.globl vpsm4_cbc_encrypt
+.type vpsm4_cbc_encrypt,%function
+.align 5
+vpsm4_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ lsr x2,x2,4
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+ cbz w5,.Ldec
+ ld1 {v3.4s},[x4]
+.Lcbc_4_blocks_enc:
+ cmp w2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b,v4.16b,v3.16b
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+ eor v5.16b,v5.16b,v4.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v5.s[0]
+ mov w13,v5.s[1]
+ mov w14,v5.s[2]
+ mov w15,v5.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v5.s[0],w15
+ mov v5.s[1],w14
+ mov v5.s[2],w13
+ mov v5.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v6.16b,v6.16b,v5.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v6.s[0]
+ mov w13,v6.s[1]
+ mov w14,v6.s[2]
+ mov w15,v6.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v6.s[0],w15
+ mov v6.s[1],w14
+ mov v6.s[2],w13
+ mov v6.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ eor v7.16b,v7.16b,v6.16b
+ mov x10,x3
+ mov w11,#8
+ mov w12,v7.s[0]
+ mov w13,v7.s[1]
+ mov w14,v7.s[2]
+ mov w15,v7.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v7.s[0],w15
+ mov v7.s[1],w14
+ mov v7.s[2],w13
+ mov v7.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ orr v3.16b,v7.16b,v7.16b
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lcbc_4_blocks_enc
+ b 2f
+1:
+ subs w2,w2,#1
+ b.lt 2f
+ ld1 {v4.4s},[x0],#16
+ eor v3.16b,v3.16b,v4.16b
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ st1 {v3.4s},[x1],#16
+ b 1b
+2:
+ // save back IV
+ st1 {v3.4s},[x4]
+ ret
+
+.Ldec:
+ // decryption mode starts
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+.Lcbc_8_blocks_dec:
+ cmp w2,#8
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+ add x10,x0,#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ bl _vpsm4_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ ld1 {v15.4s},[x4]
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+	// note: ivec1 and vtmpx[3] reuse the same register, so care is
+	// needed to avoid clobbering it
+ eor v0.16b,v0.16b,v15.16b
+ ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v1.16b,v1.16b,v8.16b
+ eor v2.16b,v2.16b,v9.16b
+ eor v3.16b,v3.16b,v10.16b
+ // save back IV
+ st1 {v15.4s}, [x4]
+ eor v4.16b,v4.16b,v11.16b
+ eor v5.16b,v5.16b,v12.16b
+ eor v6.16b,v6.16b,v13.16b
+ eor v7.16b,v7.16b,v14.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+1:
+ ld1 {v15.4s},[x4]
+.Lcbc_4_blocks_dec:
+ cmp w2,#4
+ b.lt 1f
+ ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ orr v15.16b,v7.16b,v7.16b
+ eor v2.16b,v2.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+ st1 {v7.4s}, [x4]
+ b 100f
+1: // last block
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ // save back IV
+ st1 {v4.4s}, [x4]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v4.16b
+#else
+ mov v8.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ eor v8.16b,v8.16b,v15.16b
+ st1 {v8.4s},[x1],#16
+ b 100f
+1: // last two blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
+ add x10,x0,#16
+ ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
+ subs w2,w2,1
+ b.gt 1f
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ ld1 {v4.4s,v5.4s},[x0],#32
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save back IV
+ st1 {v5.4s}, [x4]
+ b 100f
+1: // last 3 blocks
+ ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ bl _vpsm4_enc_4blks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ eor v0.16b,v0.16b,v15.16b
+ eor v1.16b,v1.16b,v4.16b
+ eor v2.16b,v2.16b,v5.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save back IV
+ st1 {v6.4s}, [x4]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt
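+// vpsm4_ctr32_encrypt_blocks: CTR mode with a 32-bit counter in the last
+// word of the IV at x4 (input x0, output x1, block count x2, key schedule
+// x3).  A single block takes a fast scalar path with no callee-saved
+// spills; otherwise the counter is expanded into 4 or 8 keystream blocks
+// at a time, encrypted with the helpers above and XORed onto the input.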
+.globl vpsm4_ctr32_encrypt_blocks
+.type vpsm4_ctr32_encrypt_blocks,%function
+.align 5
+vpsm4_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.4s},[x4]
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+ cmp w2,#1
+ b.ne 1f
+	// fast path for a single block, avoiding the
+	// context-saving overhead
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ ret
+1:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w5,v3.s[3]
+.Lctr32_4_blocks_process:
+ cmp w2,#4
+ b.lt 1f
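+	// Build four counter blocks: broadcast the three fixed IV words and
+	// place four consecutive 32-bit counter values (w5..w5+3) in v7.
+	// Only the low counter word is incremented (ctr32 semantics).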
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ add w5,w5,#1
+ mov v7.s[2],w5
+ add w5,w5,#1
+ mov v7.s[3],w5
+ add w5,w5,#1
+ cmp w2,#8
+ b.ge .Lctr32_8_blocks_process
+ bl _vpsm4_enc_4blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ subs w2,w2,#4
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+.Lctr32_8_blocks_process:
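+	// Eight-block path: v4-v7 already hold the first four counters;
+	// build four more in v8-v11, then encrypt all eight in parallel.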
+ dup v8.4s,w12
+ dup v9.4s,w13
+ dup v10.4s,w14
+ mov v11.s[0],w5
+ add w5,w5,#1
+ mov v11.s[1],w5
+ add w5,w5,#1
+ mov v11.s[2],w5
+ add w5,w5,#1
+ mov v11.s[3],w5
+ add w5,w5,#1
+ bl _vpsm4_enc_8blks
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
+ ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ eor v4.16b,v4.16b,v8.16b
+ eor v5.16b,v5.16b,v9.16b
+ eor v6.16b,v6.16b,v10.16b
+ eor v7.16b,v7.16b,v11.16b
+ st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs w2,w2,#8
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+1: // last block processing
+ subs w2,w2,#1
+ b.lt 100f
+ b.gt 1f
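+	// w2 was decremented above: negative means nothing is left, zero
+	// means exactly one block remains (handled here with the scalar
+	// round code), positive means two or three blocks remain (below).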
+ mov v3.s[0],w12
+ mov v3.s[1],w13
+ mov v3.s[2],w14
+ mov v3.s[3],w5
+ mov x10,x3
+ mov w11,#8
+ mov w12,v3.s[0]
+ mov w13,v3.s[1]
+ mov w14,v3.s[2]
+ mov w15,v3.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v3.s[0],w15
+ mov v3.s[1],w14
+ mov v3.s[2],w13
+ mov v3.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ ld1 {v4.4s},[x0]
+ eor v4.16b,v4.16b,v3.16b
+ st1 {v4.4s},[x1]
+ b 100f
+1: // last 2 blocks processing
+ dup v4.4s,w12
+ dup v5.4s,w13
+ dup v6.4s,w14
+ mov v7.s[0],w5
+ add w5,w5,#1
+ mov v7.s[1],w5
+ subs w2,w2,#1
+ b.ne 1f
+ bl _vpsm4_enc_4blks
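+	// _vpsm4_enc_4blks returns the keystream column-major in v0-v3;
+	// ld4 {...}[n] gathers input block n into the matching lanes so a
+	// plain eor and st4 produce the ciphertext for the last two blocks.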
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ b 100f
+1: // last 3 blocks processing
+ add w5,w5,#1
+ mov v7.s[2],w5
+ bl _vpsm4_enc_4blks
+ ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
+ eor v0.16b,v0.16b,v12.16b
+ eor v1.16b,v1.16b,v13.16b
+ eor v2.16b,v2.16b,v14.16b
+ eor v3.16b,v3.16b,v15.16b
+ st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
+ st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks
+.globl vpsm4_xts_encrypt_gb
+.type vpsm4_xts_encrypt_gb,%function
+.align 5
+vpsm4_xts_encrypt_gb:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v8.4s}, [x5]
+ mov x3,x27
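+	// Encrypt the IV with the tweak key schedule (x27) to produce the
+	// initial tweak in v8; x3 is restored to the data key schedule
+	// (x26) once the 32 rounds below are done.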
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
+ cmp x2,#1
+ b.lt .return_gb
+
+ cmp x29,0
+	// If the encryption/decryption length is a multiple of 16,
+	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ b.eq .xts_encrypt_blocks_gb
+
+	// If the encryption/decryption length is not a multiple of 16,
+	// the last two blocks are handled in .last_2blks_tweak_gb or .only_2blks_tweak_gb,
+	// and the remaining blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak_gb
+.xts_encrypt_blocks_gb:
+ rbit v8.16b,v8.16b
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x12,v8.d[0]
+ mov x13,v8.d[1]
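+	// Precompute eight tweaks in x12:x13 ... x26:x27. Each step doubles
+	// the previous 128-bit tweak in GF(2^128): shift left by one
+	// (lsl #1 / extr #63) and xor 0x87 into the low byte when bit 127
+	// was set (asr #31 + and). The rbit above puts the tweak into the
+	// bit order this doubling expects for the GB variant.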
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process_gb:
+ cmp x2,#8
+ b.lt .Lxts_4_blocks_process_gb
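+	// 8-block XTS body: xor each input block with its tweak, transpose
+	// the eight blocks into the column-major layout _vpsm4_enc_8blks
+	// expects, encrypt, transpose back and xor with the tweaks again;
+	// the next eight tweaks are computed in x12-x27 in the meantime.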
+ mov v0.d[0],x12
+ mov v0.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v0.16b,v0.16b
+#endif
+ mov v1.d[0],x14
+ mov v1.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v1.16b,v1.16b
+#endif
+ mov v2.d[0],x16
+ mov v2.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v2.16b,v2.16b
+#endif
+ mov v3.d[0],x18
+ mov v3.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov v12.d[0],x20
+ mov v12.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov v13.d[0],x22
+ mov v13.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov v14.d[0],x24
+ mov v14.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov v15.d[0],x26
+ mov v15.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v0.16b,v0.16b
+ rbit v1.16b,v1.16b
+ rbit v2.16b,v2.16b
+ rbit v3.16b,v3.16b
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v3.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ rbit v12.16b,v12.16b
+ rbit v13.16b,v13.16b
+ rbit v14.16b,v14.16b
+ rbit v15.16b,v15.16b
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
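+	// zip1/zip2 on .4s then on .2d transposes the 4x4 word matrix so
+	// each of v4-v7 (and v8-v11) holds the same word index taken from
+	// four different blocks.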
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ mov v12.d[0],x12
+ mov v12.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v13.d[0],x14
+ mov v13.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v14.d[0],x16
+ mov v14.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v15.d[0],x18
+ mov v15.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v11.d[0],x26
+ mov v11.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v13.16b
+ eor v2.16b, v2.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+
+ // save the last tweak
+ st1 {v11.4s},[x5]
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process_gb
+ b 100f
+.Lxts_4_blocks_process_gb:
+ mov v8.d[0],x12
+ mov v8.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x14
+ mov v9.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x16
+ mov v10.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov v11.d[0],x18
+ mov v11.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ rbit v8.16b,v8.16b
+ rbit v9.16b,v9.16b
+ rbit v10.16b,v10.16b
+ rbit v11.16b,v11.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ // save the last tweak
+ st1 {v11.4s},[x5]
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ rbit v8.16b,v8.16b
+ eor v4.16b, v4.16b, v8.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v8.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ st1 {v8.4s},[x5]
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ rbit v8.16b,v8.16b
+ rbit v9.16b,v9.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ st1 {v9.4s},[x5]
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ rbit v8.16b,v8.16b
+ rbit v9.16b,v9.16b
+ rbit v10.16b,v10.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ st1 {v10.4s},[x5]
+100:
+ cmp x29,0
+ b.eq .return_gb
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is larger than 32
+.last_2blks_tweak_gb:
+ ld1 {v8.4s},[x5]
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
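+	// NEON tweak doubling: shl #1 shifts every byte left, ext/ushr #7
+	// extracts each byte's carry, and the multiply by .Lxts_magic
+	// propagates the carries and folds bit 127 back in as 0x87; rbit
+	// before and after keeps the GB bit ordering.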
+ rbit v2.16b,v8.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v9.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v9.16b, v9.16b, v1.16b
+ rbit v9.16b,v9.16b
+ rbit v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ rbit v10.16b,v10.16b
+ b .check_dec_gb
+
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is exactly 32, which needs only these two tweaks
+.only_2blks_tweak_gb:
+ mov v9.16b,v8.16b
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ rbit v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ rbit v10.16b,v10.16b
+ b .check_dec_gb
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec_gb:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks_gb
+ mov v0.16B,v9.16b
+ mov v9.16B,v10.16b
+ mov v10.16B,v0.16b
+
+.process_last_2blks_gb:
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v9.16b
+ st1 {v4.4s},[x1],#16
+
+ sub x26,x1,16
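+	// Ciphertext stealing: swap the tail bytes of the block just written
+	// (at x26) with the remaining input bytes, then re-encrypt that
+	// stolen block with the final tweak (v10) and store it back at x26.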
+.loop_gb:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop_gb
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v10.16b
+ st1 {v4.4s}, [x26]
+.return_gb:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb
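+// vpsm4_xts_encrypt: the standard XTS variant. It mirrors
+// vpsm4_xts_encrypt_gb above, except that the tweak is multiplied
+// directly, without the rbit bit-reversal used by the _gb routine.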
+.globl vpsm4_xts_encrypt
+.type vpsm4_xts_encrypt,%function
+.align 5
+vpsm4_xts_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov x26,x3
+ mov x27,x4
+ mov w28,w6
+ ld1 {v8.4s}, [x5]
+ mov x3,x27
+ adrp x10,.Lsbox
+ add x10,x10,#:lo12:.Lsbox
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
+ ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v8.s[0]
+ mov w13,v8.s[1]
+ mov w14,v8.s[2]
+ mov w15,v8.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v8.s[0],w15
+ mov v8.s[1],w14
+ mov v8.s[2],w13
+ mov v8.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x3,x26
+ and x29,x2,#0x0F
+ // convert length into blocks
+ lsr x2,x2,4
+ cmp x2,#1
+ b.lt .return
+
+ cmp x29,0
+	// If the encryption/decryption length is a multiple of 16,
+	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
+ b.eq .xts_encrypt_blocks
+
+	// If the encryption/decryption length is not a multiple of 16,
+	// the last two blocks are handled in .last_2blks_tweak or .only_2blks_tweak,
+	// and the remaining blocks are encrypted/decrypted in .xts_encrypt_blocks
+ subs x2,x2,#1
+ b.eq .only_2blks_tweak
+.xts_encrypt_blocks:
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov x12,v8.d[0]
+ mov x13,v8.d[1]
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+.Lxts_8_blocks_process:
+ cmp x2,#8
+ b.lt .Lxts_4_blocks_process
+ mov v0.d[0],x12
+ mov v0.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v0.16b,v0.16b
+#endif
+ mov v1.d[0],x14
+ mov v1.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v1.16b,v1.16b
+#endif
+ mov v2.d[0],x16
+ mov v2.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v2.16b,v2.16b
+#endif
+ mov v3.d[0],x18
+ mov v3.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v3.16b,v3.16b
+#endif
+ mov v12.d[0],x20
+ mov v12.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov v13.d[0],x22
+ mov v13.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov v14.d[0],x24
+ mov v14.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov v15.d[0],x26
+ mov v15.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v3.16b
+ ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ zip1 v0.4s,v8.4s,v9.4s
+ zip2 v1.4s,v8.4s,v9.4s
+ zip1 v2.4s,v10.4s,v11.4s
+ zip2 v3.4s,v10.4s,v11.4s
+ zip1 v8.2d,v0.2d,v2.2d
+ zip2 v9.2d,v0.2d,v2.2d
+ zip1 v10.2d,v1.2d,v3.2d
+ zip2 v11.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_8blks
+ zip1 v8.4s,v0.4s,v1.4s
+ zip2 v9.4s,v0.4s,v1.4s
+ zip1 v10.4s,v2.4s,v3.4s
+ zip2 v11.4s,v2.4s,v3.4s
+ zip1 v0.2d,v8.2d,v10.2d
+ zip2 v1.2d,v8.2d,v10.2d
+ zip1 v2.2d,v9.2d,v11.2d
+ zip2 v3.2d,v9.2d,v11.2d
+ zip1 v8.4s,v4.4s,v5.4s
+ zip2 v9.4s,v4.4s,v5.4s
+ zip1 v10.4s,v6.4s,v7.4s
+ zip2 v11.4s,v6.4s,v7.4s
+ zip1 v4.2d,v8.2d,v10.2d
+ zip2 v5.2d,v8.2d,v10.2d
+ zip1 v6.2d,v9.2d,v11.2d
+ zip2 v7.2d,v9.2d,v11.2d
+ mov v12.d[0],x12
+ mov v12.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v12.16b,v12.16b
+#endif
+ mov w7,0x87
+ extr x9,x27,x27,#32
+ extr x13,x27,x26,#63
+ and w8,w7,w9,asr#31
+ eor x12,x8,x26,lsl#1
+ mov v13.d[0],x14
+ mov v13.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v13.16b,v13.16b
+#endif
+ mov w7,0x87
+ extr x9,x13,x13,#32
+ extr x15,x13,x12,#63
+ and w8,w7,w9,asr#31
+ eor x14,x8,x12,lsl#1
+ mov v14.d[0],x16
+ mov v14.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v14.16b,v14.16b
+#endif
+ mov w7,0x87
+ extr x9,x15,x15,#32
+ extr x17,x15,x14,#63
+ and w8,w7,w9,asr#31
+ eor x16,x8,x14,lsl#1
+ mov v15.d[0],x18
+ mov v15.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v15.16b,v15.16b
+#endif
+ mov w7,0x87
+ extr x9,x17,x17,#32
+ extr x19,x17,x16,#63
+ and w8,w7,w9,asr#31
+ eor x18,x8,x16,lsl#1
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov w7,0x87
+ extr x9,x19,x19,#32
+ extr x21,x19,x18,#63
+ and w8,w7,w9,asr#31
+ eor x20,x8,x18,lsl#1
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov w7,0x87
+ extr x9,x21,x21,#32
+ extr x23,x21,x20,#63
+ and w8,w7,w9,asr#31
+ eor x22,x8,x20,lsl#1
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov w7,0x87
+ extr x9,x23,x23,#32
+ extr x25,x23,x22,#63
+ and w8,w7,w9,asr#31
+ eor x24,x8,x22,lsl#1
+ mov v11.d[0],x26
+ mov v11.d[1],x27
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ mov w7,0x87
+ extr x9,x25,x25,#32
+ extr x27,x25,x24,#63
+ and w8,w7,w9,asr#31
+ eor x26,x8,x24,lsl#1
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v13.16b
+ eor v2.16b, v2.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+
+ // save the last tweak
+ st1 {v11.4s},[x5]
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
+ subs x2,x2,#8
+ b.gt .Lxts_8_blocks_process
+ b 100f
+.Lxts_4_blocks_process:
+ mov v8.d[0],x12
+ mov v8.d[1],x13
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x14
+ mov v9.d[1],x15
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x16
+ mov v10.d[1],x17
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ mov v11.d[0],x18
+ mov v11.d[1],x19
+#ifdef __AARCH64EB__
+ rev32 v11.16b,v11.16b
+#endif
+ cmp x2,#4
+ b.lt 1f
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v7.16b,v7.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
+ sub x2,x2,#4
+ mov v8.d[0],x20
+ mov v8.d[1],x21
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
+ mov v9.d[0],x22
+ mov v9.d[1],x23
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v10.d[0],x24
+ mov v10.d[1],x25
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ // save the last tweak
+ st1 {v11.4s},[x5]
+1:
+ // process last block
+ cmp x2,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v8.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v8.16b
+ st1 {v4.4s},[x1],#16
+ // save the last tweak
+ st1 {v8.4s},[x5]
+ b 100f
+1: // process last 2 blocks
+ cmp x2,#2
+ b.gt 1f
+ ld1 {v4.4s,v5.4s},[x0],#32
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ st1 {v0.4s,v1.4s},[x1],#32
+ // save the last tweak
+ st1 {v9.4s},[x5]
+ b 100f
+1: // process last 3 blocks
+ ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v5.16b,v5.16b
+#endif
+#ifndef __AARCH64EB__
+ rev32 v6.16b,v6.16b
+#endif
+ zip1 v0.4s,v4.4s,v5.4s
+ zip2 v1.4s,v4.4s,v5.4s
+ zip1 v2.4s,v6.4s,v7.4s
+ zip2 v3.4s,v6.4s,v7.4s
+ zip1 v4.2d,v0.2d,v2.2d
+ zip2 v5.2d,v0.2d,v2.2d
+ zip1 v6.2d,v1.2d,v3.2d
+ zip2 v7.2d,v1.2d,v3.2d
+ bl _vpsm4_enc_4blks
+ zip1 v4.4s,v0.4s,v1.4s
+ zip2 v5.4s,v0.4s,v1.4s
+ zip1 v6.4s,v2.4s,v3.4s
+ zip2 v7.4s,v2.4s,v3.4s
+ zip1 v0.2d,v4.2d,v6.2d
+ zip2 v1.2d,v4.2d,v6.2d
+ zip1 v2.2d,v5.2d,v7.2d
+ zip2 v3.2d,v5.2d,v7.2d
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ st1 {v0.4s,v1.4s,v2.4s},[x1],#48
+ // save the last tweak
+ st1 {v10.4s},[x5]
+100:
+ cmp x29,0
+ b.eq .return
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is larger than 32
+.last_2blks_tweak:
+ ld1 {v8.4s},[x5]
+#ifdef __AARCH64EB__
+ rev32 v8.16b,v8.16b
+#endif
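+	// NEON tweak doubling, as in the _gb path but without rbit: shl #1
+	// shifts every byte, ext/ushr #7 extracts the per-byte carries, and
+	// the multiply by .Lxts_magic propagates them and applies the 0x87
+	// reduction when bit 127 was set.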
+ mov v2.16b,v8.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v9.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v9.16b, v9.16b, v1.16b
+ mov v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ b .check_dec
+
+
+// This branch calculates the last two tweaks
+// when the encryption/decryption length is exactly 32, which needs only these two tweaks
+.only_2blks_tweak:
+ mov v9.16b,v8.16b
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+ mov v2.16b,v9.16b
+ adrp x10,.Lxts_magic
+ ldr q0, [x10, #:lo12:.Lxts_magic]
+ shl v10.16b, v2.16b, #1
+ ext v1.16b, v2.16b, v2.16b,#15
+ ushr v1.16b, v1.16b, #7
+ mul v1.16b, v1.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ b .check_dec
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec:
+ // encryption:1 decryption:0
+ cmp w28,1
+ b.eq .process_last_2blks
+ mov v0.16B,v9.16b
+ mov v9.16B,v10.16b
+ mov v10.16B,v0.16b
+
+.process_last_2blks:
+#ifdef __AARCH64EB__
+ rev32 v9.16b,v9.16b
+#endif
+#ifdef __AARCH64EB__
+ rev32 v10.16b,v10.16b
+#endif
+ ld1 {v4.4s},[x0],#16
+ eor v4.16b, v4.16b, v9.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v9.16b
+ st1 {v4.4s},[x1],#16
+
+ sub x26,x1,16
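+	// Ciphertext stealing for the partial final block, same scheme as
+	// .loop_gb above: swap the tail bytes, then encrypt the stolen block
+	// with the final tweak (v10) and store it at x26.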
+.loop:
+ subs x29,x29,1
+ ldrb w7,[x26,x29]
+ ldrb w8,[x0,x29]
+ strb w8,[x26,x29]
+ strb w7,[x1,x29]
+ b.gt .loop
+ ld1 {v4.4s}, [x26]
+ eor v4.16b, v4.16b, v10.16b
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ mov x10,x3
+ mov w11,#8
+ mov w12,v4.s[0]
+ mov w13,v4.s[1]
+ mov w14,v4.s[2]
+ mov w15,v4.s[3]
+10:
+ ldp w7,w8,[x10],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor w6,w14,w15
+ eor w9,w7,w13
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w12,w12,w6
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor w6,w14,w15
+ eor w9,w12,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ ldp w7,w8,[x10],8
+ eor w13,w13,w6
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor w6,w12,w13
+ eor w9,w7,w15
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w14,w14,w6
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor w6,w12,w13
+ eor w9,w14,w8
+ eor w6,w6,w9
+ movi v1.16b,#64
+ movi v2.16b,#128
+ movi v3.16b,#192
+ mov v0.s[0],w6
+
+ sub v1.16b,v0.16b,v1.16b
+ sub v2.16b,v0.16b,v2.16b
+ sub v3.16b,v0.16b,v3.16b
+
+ tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
+ tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
+ tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
+ tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
+
+ mov w6,v0.s[0]
+ mov w7,v1.s[0]
+ mov w9,v2.s[0]
+ add w7,w6,w7
+ mov w6,v3.s[0]
+ add w7,w7,w9
+ add w7,w7,w6
+
+ eor w6,w7,w7,ror #32-2
+ eor w6,w6,w7,ror #32-10
+ eor w6,w6,w7,ror #32-18
+ eor w6,w6,w7,ror #32-24
+ eor w15,w15,w6
+ subs w11,w11,#1
+ b.ne 10b
+ mov v4.s[0],w15
+ mov v4.s[1],w14
+ mov v4.s[2],w13
+ mov v4.s[3],w12
+#ifndef __AARCH64EB__
+ rev32 v4.16b,v4.16b
+#endif
+ eor v4.16b, v4.16b, v10.16b
+ st1 {v4.4s}, [x26]
+.return:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpsm4_xts_encrypt,.-vpsm4_xts_encrypt