Diffstat (limited to 'sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S')
-rw-r--r-- | sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S | 4523
1 file changed, 4523 insertions, 0 deletions
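For readers following the round structure in the listing below, here is a minimal C sketch of the plain SM4 rounds that the assembly's own comments describe ("B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)" followed by the rotate-and-XOR linear transform). It illustrates only the reference algorithm, not the AESE/TBL affine-transform trick the assembly uses to evaluate the S-box; sm4_sbox_byte is a hypothetical stand-in for the standard SM4 S-box table, which is omitted here.

```c
#include <stdint.h>

/* Stand-in for the SM4 S-box lookup (assumed helper; the 256-entry table is
 * omitted). The assembly computes the same byte substitution via affine
 * transforms around the AES S-box (AESE) instead of a table lookup. */
uint8_t sm4_sbox_byte(uint8_t x);

static uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* tau: apply the S-box to each byte of a 32-bit word. */
static uint32_t sm4_tau(uint32_t x)
{
    return (uint32_t)sm4_sbox_byte((uint8_t)(x >> 24)) << 24 |
           (uint32_t)sm4_sbox_byte((uint8_t)(x >> 16)) << 16 |
           (uint32_t)sm4_sbox_byte((uint8_t)(x >> 8))  << 8  |
           (uint32_t)sm4_sbox_byte((uint8_t)x);
}

/* Encryption linear transform L: matches the "eor w6,w7,w7,ror #32-2/10/18/24"
 * sequence (and the ushr/sli pairs in the vector paths). */
static uint32_t sm4_L(uint32_t b)
{
    return b ^ rol32(b, 2) ^ rol32(b, 10) ^ rol32(b, 18) ^ rol32(b, 24);
}

/* Key-schedule linear transform L': matches the "ror #19"/"ror #9" pair in
 * _vpsm4_ex_set_key (rotate left by 13 and 23). */
static uint32_t sm4_Lprime(uint32_t b)
{
    return b ^ rol32(b, 13) ^ rol32(b, 23);
}

/* One block is four 32-bit words B0..B3; rk points at the 32 round keys.
 * Each loop iteration corresponds to one unrolled group of four rounds in
 * the assembly ("B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)" and so on). */
void sm4_rounds_ref(uint32_t B[4], const uint32_t rk[32])
{
    for (int i = 0; i < 32; i += 4) {
        B[0] ^= sm4_L(sm4_tau(B[1] ^ B[2] ^ B[3] ^ rk[i]));
        B[1] ^= sm4_L(sm4_tau(B[0] ^ B[2] ^ B[3] ^ rk[i + 1]));
        B[2] ^= sm4_L(sm4_tau(B[0] ^ B[1] ^ B[3] ^ rk[i + 2]));
        B[3] ^= sm4_L(sm4_tau(B[0] ^ B[1] ^ B[2] ^ rk[i + 3]));
    }
    /* The cipher output is (B3, B2, B1, B0), which is why the assembly
     * writes the words back in reverse order before the final rev32. */
}
```

The decryption path in the listing reuses the same round code and simply walks the round keys in reverse, which is why vpsm4_ex_set_decrypt_key stores the schedule starting from the end of the key buffer.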
diff --git a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S new file mode 100644 index 000000000000..5627d6d1c6b4 --- /dev/null +++ b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S @@ -0,0 +1,4523 @@ +/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */ +// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// +// This module implements SM4 with ASIMD and AESE on AARCH64 +// +// Dec 2022 +// + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.type _vpsm4_ex_consts,%object +.align 7 +_vpsm4_ex_consts: +.Lck: +.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 +.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 +.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 +.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 +.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 +.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 +.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 +.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: +.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197 +.Lshuffles: +.quad 0x0B0A090807060504,0x030201000F0E0D0C +.Lxts_magic: +.quad 0x0101010101010187,0x0101010101010101 +.Lsbox_magic: +.quad 0x0b0e0104070a0d00,0x0306090c0f020508 +.quad 0x62185a2042387a00,0x22581a6002783a40 +.quad 0x15df62a89e54e923,0xc10bb67c4a803df7 +.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead +.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc +.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f + +.size _vpsm4_ex_consts,.-_vpsm4_ex_consts +.type _vpsm4_ex_set_key,%function +.align 4 +_vpsm4_ex_set_key: + AARCH64_VALID_CALL_TARGET + ld1 {v5.4s},[x0] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + adrp x5,.Lshuffles + add x5,x5,#:lo12:.Lshuffles + ld1 {v7.2d},[x5] + adrp x5,.Lfk + add x5,x5,#:lo12:.Lfk + ld1 {v6.2d},[x5] + eor v5.16b,v5.16b,v6.16b + mov x6,#32 + adrp x5,.Lck + add x5,x5,#:lo12:.Lck + movi v0.16b,#64 + cbnz w2,1f + add x1,x1,124 +1: + mov w7,v5.s[1] + ldr w8,[x5],#4 + eor w8,w8,w7 + mov w7,v5.s[2] + eor w8,w8,w7 + mov w7,v5.s[3] + eor w8,w8,w7 + // optimize sbox using AESE instruction + mov v4.s[0],w8 + tbl v0.16b, {v4.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + mov w7,v0.s[0] + eor w8,w7,w7,ror #19 + eor w8,w8,w7,ror #9 + mov w7,v5.s[0] + eor w8,w8,w7 + mov v5.s[0],w8 + cbz w2,2f + str w8,[x1],#4 + b 3f +2: + str w8,[x1],#-4 +3: + tbl v5.16b,{v5.16b},v7.16b + subs x6,x6,#1 + b.ne 1b + ret +.size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key +.type _vpsm4_ex_enc_4blks,%function +.align 4 +_vpsm4_ex_enc_4blks: + AARCH64_VALID_CALL_TARGET + mov 
x10,x3 + mov w11,#8 +10: + ldp w7,w8,[x10],8 + dup v12.4s,w7 + dup v13.4s,w8 + + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor v14.16b,v6.16b,v7.16b + eor v12.16b,v5.16b,v12.16b + eor v12.16b,v14.16b,v12.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v12.16b,v0.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + eor v4.16b,v4.16b,v12.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor v14.16b,v14.16b,v4.16b + eor v13.16b,v14.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v13.16b,v0.16b + + // linear transformation + ushr v0.4s,v13.4s,32-2 + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v0.4s,v13.4s,2 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v0.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + ldp w7,w8,[x10],8 + eor v5.16b,v5.16b,v13.16b + + dup v12.4s,w7 + dup v13.4s,w8 + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor v14.16b,v4.16b,v5.16b + eor v12.16b,v7.16b,v12.16b + eor v12.16b,v14.16b,v12.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v12.16b,v0.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + eor v6.16b,v6.16b,v12.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor v14.16b,v14.16b,v6.16b + eor v13.16b,v14.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v13.16b,v0.16b + + // linear transformation + ushr v0.4s,v13.4s,32-2 + ushr v1.4s,v13.4s,32-10 + ushr 
v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v0.4s,v13.4s,2 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v0.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v7.16b,v7.16b,v13.16b + subs w11,w11,#1 + b.ne 10b +#ifndef __AARCH64EB__ + rev32 v3.16b,v4.16b +#else + mov v3.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v2.16b,v5.16b +#else + mov v2.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v1.16b,v6.16b +#else + mov v1.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v0.16b,v7.16b +#else + mov v0.16b,v7.16b +#endif + ret +.size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks +.type _vpsm4_ex_enc_8blks,%function +.align 4 +_vpsm4_ex_enc_8blks: + AARCH64_VALID_CALL_TARGET + mov x10,x3 + mov w11,#8 +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + dup v12.4s,w7 + eor v14.16b,v6.16b,v7.16b + eor v15.16b,v10.16b,v11.16b + eor v0.16b,v5.16b,v12.16b + eor v1.16b,v9.16b,v12.16b + eor v12.16b,v14.16b,v0.16b + eor v13.16b,v15.16b,v1.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v4.16b,v4.16b,v12.16b + eor v8.16b,v8.16b,v13.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + dup v13.4s,w8 + eor v14.16b,v14.16b,v4.16b + eor v15.16b,v15.16b,v8.16b + eor v12.16b,v14.16b,v13.16b + eor v13.16b,v15.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + 
mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + ldp w7,w8,[x10],8 + eor v5.16b,v5.16b,v12.16b + eor v9.16b,v9.16b,v13.16b + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + dup v12.4s,w7 + eor v14.16b,v4.16b,v5.16b + eor v15.16b,v8.16b,v9.16b + eor v0.16b,v7.16b,v12.16b + eor v1.16b,v11.16b,v12.16b + eor v12.16b,v14.16b,v0.16b + eor v13.16b,v15.16b,v1.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v6.16b,v6.16b,v12.16b + eor v10.16b,v10.16b,v13.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + dup v13.4s,w8 + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v10.16b + eor v12.16b,v14.16b,v13.16b + eor v13.16b,v15.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear 
transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v7.16b,v7.16b,v12.16b + eor v11.16b,v11.16b,v13.16b + subs w11,w11,#1 + b.ne 10b +#ifndef __AARCH64EB__ + rev32 v3.16b,v4.16b +#else + mov v3.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v2.16b,v5.16b +#else + mov v2.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v1.16b,v6.16b +#else + mov v1.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v0.16b,v7.16b +#else + mov v0.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v8.16b +#else + mov v7.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v9.16b +#else + mov v6.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v10.16b +#else + mov v5.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v4.16b,v11.16b +#else + mov v4.16b,v11.16b +#endif + ret +.size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks +.globl vpsm4_ex_set_encrypt_key +.type vpsm4_ex_set_encrypt_key,%function +.align 5 +vpsm4_ex_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,1 + bl _vpsm4_ex_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key +.globl vpsm4_ex_set_decrypt_key +.type vpsm4_ex_set_decrypt_key,%function +.align 5 +vpsm4_ex_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ mov w2,0 + bl _vpsm4_ex_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key +.globl vpsm4_ex_encrypt +.type vpsm4_ex_encrypt,%function +.align 5 +vpsm4_ex_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v4.4s},[x0] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x3,x2 + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + 
st1 {v4.4s},[x1] + ret +.size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt +.globl vpsm4_ex_decrypt +.type vpsm4_ex_decrypt,%function +.align 5 +vpsm4_ex_decrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v4.4s},[x0] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x3,x2 + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + st1 {v4.4s},[x1] + ret +.size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt +.globl 
vpsm4_ex_ecb_encrypt +.type vpsm4_ex_ecb_encrypt,%function +.align 5 +vpsm4_ex_ecb_encrypt: + AARCH64_SIGN_LINK_REGISTER + // convert length into blocks + lsr x2,x2,4 + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +.Lecb_8_blocks_process: + cmp w2,#8 + b.lt .Lecb_4_blocks_process + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + bl _vpsm4_ex_enc_8blks + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp w2,#4 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub w2,w2,#4 +1: + // process last block + cmp w2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, 
v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + st1 {v4.4s},[x1] + b 100f +1: // process last 2 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16 + ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16 + cmp w2,#2 + b.gt 1f +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1] + b 100f +1: // process last 3 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt +.globl vpsm4_ex_cbc_encrypt +.type vpsm4_ex_cbc_encrypt,%function +.align 5 +vpsm4_ex_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + lsr x2,x2,4 + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] + cbz w5,.Ldec + ld1 {v3.4s},[x4] +.Lcbc_4_blocks_enc: + cmp w2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b,v4.16b,v3.16b +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b 
+ ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 + eor v5.16b,v5.16b,v4.16b + mov x10,x3 + mov w11,#8 + mov w12,v5.s[0] + mov w13,v5.s[1] + mov w14,v5.s[2] + mov w15,v5.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + 
aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v5.s[0],w15 + mov v5.s[1],w14 + mov v5.s[2],w13 + mov v5.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v6.16b,v6.16b,v5.16b + mov x10,x3 + mov w11,#8 + mov w12,v6.s[0] + mov w13,v6.s[1] + mov w14,v6.s[2] + mov w15,v6.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + 
tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v6.s[0],w15 + mov v6.s[1],w14 + mov v6.s[2],w13 + mov v6.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + eor v7.16b,v7.16b,v6.16b + mov x10,x3 + mov w11,#8 + mov w12,v7.s[0] + mov w13,v7.s[1] + mov w14,v7.s[2] + mov w15,v7.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, 
v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v7.s[0],w15 + mov v7.s[1],w14 + mov v7.s[2],w13 + mov v7.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + orr v3.16b,v7.16b,v7.16b + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#4 + b.ne .Lcbc_4_blocks_enc + b 2f +1: + subs w2,w2,#1 + b.lt 2f + ld1 {v4.4s},[x0],#16 + eor v3.16b,v3.16b,v4.16b +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor 
v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + st1 {v3.4s},[x1],#16 + b 1b +2: + // save back IV + st1 {v3.4s},[x4] + ret + +.Ldec: + // decryption mode starts + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +.Lcbc_8_blocks_dec: + cmp w2,#8 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] + add x10,x0,#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + bl _vpsm4_ex_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + ld1 {v15.4s},[x4] + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + // note ivec1 and vtmpx[3] are reusing the same register + // care needs to be taken to avoid conflict + eor v0.16b,v0.16b,v15.16b + ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + eor v1.16b,v1.16b,v8.16b + eor v2.16b,v2.16b,v9.16b + eor v3.16b,v3.16b,v10.16b + // save back IV + st1 {v15.4s}, [x4] + eor v4.16b,v4.16b,v11.16b + eor v5.16b,v5.16b,v12.16b + eor v6.16b,v6.16b,v13.16b + eor v7.16b,v7.16b,v14.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.gt .Lcbc_8_blocks_dec + b.eq 100f +1: + ld1 {v15.4s},[x4] +.Lcbc_4_blocks_dec: + cmp w2,#4 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + orr v15.16b,v7.16b,v7.16b + eor v2.16b,v2.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + subs w2,w2,#4 + b.gt .Lcbc_4_blocks_dec + // save back IV + st1 {v7.4s}, [x4] + b 100f +1: // last block + subs w2,w2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + // save back IV + st1 {v4.4s}, [x4] +#ifndef __AARCH64EB__ + rev32 v8.16b,v4.16b +#else + mov v8.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v8.s[0] + mov w13,v8.s[1] + mov w14,v8.s[2] + mov w15,v8.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, 
v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v8.s[0],w15 + mov v8.s[1],w14 + mov v8.s[2],w13 + mov v8.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + eor v8.16b,v8.16b,v15.16b + st1 {v8.4s},[x1],#16 + b 100f +1: // last two blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0] + add x10,x0,#16 + ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16 + subs w2,w2,1 + b.gt 1f +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + ld1 {v4.4s,v5.4s},[x0],#32 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save back IV + st1 {v5.4s}, [x4] + b 100f +1: // last 3 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef 
__AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + eor v2.16b,v2.16b,v5.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save back IV + st1 {v6.4s}, [x4] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt +.globl vpsm4_ex_ctr32_encrypt_blocks +.type vpsm4_ex_ctr32_encrypt_blocks,%function +.align 5 +vpsm4_ex_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + ld1 {v3.4s},[x4] +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] + cmp w2,#1 + b.ne 1f + // fast processing for one single block without + // context saving overhead + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor 
w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + ld1 {v4.4s},[x0] + eor v4.16b,v4.16b,v3.16b + st1 {v4.4s},[x1] + ret +1: + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w5,v3.s[3] +.Lctr32_4_blocks_process: + cmp w2,#4 + b.lt 1f + dup v4.4s,w12 + dup v5.4s,w13 + dup v6.4s,w14 + mov v7.s[0],w5 + add w5,w5,#1 + mov v7.s[1],w5 + add w5,w5,#1 + mov v7.s[2],w5 + add w5,w5,#1 + mov v7.s[3],w5 + add w5,w5,#1 + cmp w2,#8 + b.ge .Lctr32_8_blocks_process + bl _vpsm4_ex_enc_4blks + ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + subs w2,w2,#4 + b.ne .Lctr32_4_blocks_process + b 100f +.Lctr32_8_blocks_process: + dup v8.4s,w12 + dup v9.4s,w13 + dup v10.4s,w14 + mov v11.s[0],w5 + add w5,w5,#1 + mov v11.s[1],w5 + add w5,w5,#1 + mov v11.s[2],w5 + add w5,w5,#1 + mov v11.s[3],w5 + add w5,w5,#1 + bl _vpsm4_ex_enc_8blks + ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + eor v4.16b,v4.16b,v8.16b + eor v5.16b,v5.16b,v9.16b + eor v6.16b,v6.16b,v10.16b + eor v7.16b,v7.16b,v11.16b + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.ne .Lctr32_4_blocks_process + b 100f +1: // last block processing + subs w2,w2,#1 + b.lt 100f + b.gt 1f + mov v3.s[0],w12 + mov v3.s[1],w13 + mov v3.s[2],w14 + mov v3.s[3],w5 + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + 
aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + ld1 {v4.4s},[x0] + eor v4.16b,v4.16b,v3.16b + st1 {v4.4s},[x1] + b 100f +1: // last 2 blocks processing + dup v4.4s,w12 + dup v5.4s,w13 + dup v6.4s,w14 + mov v7.s[0],w5 + add w5,w5,#1 + mov v7.s[1],w5 + subs w2,w2,#1 + b.ne 1f + bl _vpsm4_ex_enc_4blks + ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + b 100f +1: // last 3 blocks processing + add w5,w5,#1 + mov v7.s[2],w5 + bl _vpsm4_ex_enc_4blks + ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16 +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks +.globl vpsm4_ex_xts_encrypt_gb +.type vpsm4_ex_xts_encrypt_gb,%function +.align 5 +vpsm4_ex_xts_encrypt_gb: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
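The single-block code above (the w2==1 fast path of vpsm4_ex_ctr32_encrypt_blocks) and the tweak setup that follows both hand-roll the SM4 round: the tbl/aese/tbl group commented "optimize sbox using AESE instruction" evaluates the SM4 S-box through the AES S-box core, and the four ror #32-2/10/18/24 XORs form the linear transform L(x) = x ^ (x<<<2) ^ (x<<<10) ^ (x<<<18) ^ (x<<<24). A minimal C reference of the same round structure, using a plain table S-box instead of the AESE trick (SM4_SBOX and the function names here are illustrative, not part of this file):

    #include <stdint.h>

    extern const uint8_t SM4_SBOX[256];   /* standard SM4 S-box table, assumed available */

    static uint32_t rol32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* T(x) = L(tau(x)): byte-wise S-box, then the linear transform that the
     * assembly builds from the four "ror #32-n" XORs.                        */
    static uint32_t sm4_T(uint32_t x)
    {
        uint32_t t = (uint32_t)SM4_SBOX[x >> 24] << 24 |
                     (uint32_t)SM4_SBOX[(x >> 16) & 0xff] << 16 |
                     (uint32_t)SM4_SBOX[(x >> 8) & 0xff] << 8 |
                     (uint32_t)SM4_SBOX[x & 0xff];
        return t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ rol32(t, 24);
    }

    /* 32 rounds on one block; mirrors the 8x-unrolled loop at label "10:",
     * where w12..w15 hold B0..B3 and x10 walks the 32 round keys.           */
    static void sm4_encrypt_block_ref(uint32_t B[4], const uint32_t rk[32])
    {
        for (int i = 0; i < 32; i += 4) {
            B[0] ^= sm4_T(B[1] ^ B[2] ^ B[3] ^ rk[i]);
            B[1] ^= sm4_T(B[0] ^ B[2] ^ B[3] ^ rk[i + 1]);
            B[2] ^= sm4_T(B[0] ^ B[1] ^ B[3] ^ rk[i + 2]);
            B[3] ^= sm4_T(B[0] ^ B[1] ^ B[2] ^ rk[i + 3]);
        }
        /* final word reversal, i.e. the mov v.s[0..3], w15..w12 at the loop exit */
        uint32_t t = B[0]; B[0] = B[3]; B[3] = t;
        t = B[1]; B[1] = B[2]; B[2] = t;
    }

The AESE path reaches the same S-box values without a table: eor v1,v1,v1 zeroes the AES round key, and the .Lsbox_magic constants held in v26-v31 supply the byte permutation, the nibble-split affine lookups and the 0x0f mask that let AESE's shared GF(2^8) inversion stand in for the SM4 inversion step.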
+ mov x26,x3 + mov x27,x4 + mov w28,w6 + ld1 {v16.4s}, [x5] + mov x3,x27 + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v16.s[0] + mov w13,v16.s[1] + mov w14,v16.s[2] + mov w15,v16.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v16.s[0],w15 + mov v16.s[1],w14 + mov v16.s[2],w13 + mov v16.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x3,x26 + and x29,x2,#0x0F + // convert length into blocks + lsr x2,x2,4 + cmp x2,#1 + b.lt .return_gb + + cmp x29,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are 
encrypted/decrypted in .xts_encrypt_blocks_gb + b.eq .xts_encrypt_blocks_gb + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb + subs x2,x2,#1 + b.eq .only_2blks_tweak_gb +.xts_encrypt_blocks_gb: + rbit v16.16b,v16.16b +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x12,v16.d[0] + mov x13,v16.d[1] + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 +.Lxts_8_blocks_process_gb: + cmp x2,#8 + mov v16.d[0],x12 + mov v16.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov w7,0x87 + extr x9,x27,x27,#32 + extr x13,x27,x26,#63 + and w8,w7,w9,asr#31 + eor x12,x8,x26,lsl#1 + mov v17.d[0],x14 + mov v17.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov v18.d[0],x16 + mov v18.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov v19.d[0],x18 + mov v19.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov v20.d[0],x20 + mov v20.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov v21.d[0],x22 + mov v21.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov v22.d[0],x24 + mov v22.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov v23.d[0],x26 + mov v23.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 + b.lt .Lxts_4_blocks_process_gb + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + rbit v18.16b,v18.16b + rbit v19.16b,v19.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + rbit v20.16b,v20.16b + rbit v21.16b,v21.16b + rbit v22.16b,v22.16b + rbit v23.16b,v23.16b + eor v8.16b, v8.16b, v20.16b + eor v9.16b, v9.16b, v21.16b + eor v10.16b, v10.16b, v22.16b + eor v11.16b, v11.16b, v23.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef 
__AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + zip1 v0.4s,v8.4s,v9.4s + zip2 v1.4s,v8.4s,v9.4s + zip1 v2.4s,v10.4s,v11.4s + zip2 v3.4s,v10.4s,v11.4s + zip1 v8.2d,v0.2d,v2.2d + zip2 v9.2d,v0.2d,v2.2d + zip1 v10.2d,v1.2d,v3.2d + zip2 v11.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + + // save the last tweak + mov v25.16b,v23.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs x2,x2,#8 + b.gt .Lxts_8_blocks_process_gb + b 100f +.Lxts_4_blocks_process_gb: + cmp x2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + rbit v18.16b,v18.16b + rbit v19.16b,v19.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub x2,x2,#4 + mov v16.16b,v20.16b + mov v17.16b,v21.16b + mov v18.16b,v22.16b + // save the last tweak + mov v25.16b,v19.16b +1: + // process last block + cmp x2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + rbit v16.16b,v16.16b + eor v4.16b, v4.16b, v16.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, 
{v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v16.16b + st1 {v4.4s},[x1],#16 + // save the last tweak + mov v25.16b,v16.16b + b 100f +1: // process last 2 blocks + cmp x2,#2 + b.gt 1f + ld1 {v4.4s,v5.4s},[x0],#32 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save the last tweak + mov v25.16b,v17.16b + b 100f +1: // process last 3 blocks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + rbit v18.16b,v18.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b +#ifndef 
__AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save the last tweak + mov v25.16b,v18.16b +100: + cmp x29,0 + b.eq .return_gb + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak_gb: +#ifdef __AARCH64EB__ + rev32 v25.16b,v25.16b +#endif + rbit v2.16b,v25.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v17.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + rbit v17.16b,v17.16b + rbit v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + rbit v18.16b,v18.16b + b .check_dec_gb + + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak_gb: + mov v17.16b,v16.16b +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + rbit v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + rbit v18.16b,v18.16b + b .check_dec_gb + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. 
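In the two tweak-derivation stubs just above, the shl/ext/ushr/mul group with the .Lxts_magic constant multiplies the 128-bit tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: every byte is shifted left by one, the bit that falls out of each byte is carried into the next, and the bit that falls out of byte 15 (bit 127) is folded back into byte 0 as 0x87. The surrounding rbit instructions, which reverse the bits within each byte, are what distinguish this GB/T ordering from the IEEE-style routine later in the file, which runs the same multiply without them. A byte-oriented C sketch of the multiply (helper name illustrative):

    #include <stdint.h>

    /* Multiply a 128-bit XTS tweak by x in GF(2^128) modulo
     * x^128 + x^7 + x^2 + x + 1 -- the 0x87 byte of .Lxts_magic.
     * Byte 0 is the least-significant byte, as in the vector code. */
    void xts_double(uint8_t t[16])
    {
        uint8_t carry = 0;
        for (int i = 0; i < 16; i++) {
            uint8_t msb = t[i] >> 7;              /* bit that moves into the next byte */
            t[i] = (uint8_t)((t[i] << 1) | carry);
            carry = msb;
        }
        if (carry)
            t[0] ^= 0x87;                         /* fold the carry out of bit 127 back in */
    }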
+.check_dec_gb: + // encryption:1 decryption:0 + cmp w28,1 + b.eq .process_last_2blks_gb + mov v0.16B,v17.16b + mov v17.16B,v18.16b + mov v18.16B,v0.16b + +.process_last_2blks_gb: +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v17.16b + st1 {v4.4s},[x1],#16 + + sub x26,x1,16 +.loop_gb: + subs x29,x29,1 + ldrb w7,[x26,x29] + ldrb w8,[x0,x29] + strb w8,[x26,x29] + strb w7,[x1,x29] + b.gt .loop_gb + ld1 {v4.4s}, [x26] + eor 
v4.16b, v4.16b, v18.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v18.16b + st1 {v4.4s}, [x26] +.return_gb: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb +.globl vpsm4_ex_xts_encrypt +.type 
vpsm4_ex_xts_encrypt,%function +.align 5 +vpsm4_ex_xts_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! + mov x26,x3 + mov x27,x4 + mov w28,w6 + ld1 {v16.4s}, [x5] + mov x3,x27 + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v16.s[0] + mov w13,v16.s[1] + mov w14,v16.s[2] + mov w15,v16.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror 
#32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v16.s[0],w15 + mov v16.s[1],w14 + mov v16.s[2],w13 + mov v16.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x3,x26 + and x29,x2,#0x0F + // convert length into blocks + lsr x2,x2,4 + cmp x2,#1 + b.lt .return + + cmp x29,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are encrypted/decrypted in .xts_encrypt_blocks + b.eq .xts_encrypt_blocks + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks + subs x2,x2,#1 + b.eq .only_2blks_tweak +.xts_encrypt_blocks: +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x12,v16.d[0] + mov x13,v16.d[1] + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 +.Lxts_8_blocks_process: + cmp x2,#8 + mov v16.d[0],x12 + mov v16.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov w7,0x87 + extr x9,x27,x27,#32 + extr x13,x27,x26,#63 + and w8,w7,w9,asr#31 + eor x12,x8,x26,lsl#1 + mov v17.d[0],x14 + mov v17.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov v18.d[0],x16 + mov v18.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov v19.d[0],x18 + mov v19.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov v20.d[0],x20 + mov v20.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov v21.d[0],x22 + mov v21.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov v22.d[0],x24 + mov v22.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov v23.d[0],x26 + mov v23.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 + b.lt .Lxts_4_blocks_process + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + eor v8.16b, v8.16b, v20.16b + eor v9.16b, v9.16b, v21.16b + eor v10.16b, v10.16b, v22.16b + eor 
v11.16b, v11.16b, v23.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + zip1 v0.4s,v8.4s,v9.4s + zip2 v1.4s,v8.4s,v9.4s + zip1 v2.4s,v10.4s,v11.4s + zip2 v3.4s,v10.4s,v11.4s + zip1 v8.2d,v0.2d,v2.2d + zip2 v9.2d,v0.2d,v2.2d + zip1 v10.2d,v1.2d,v3.2d + zip2 v11.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + + // save the last tweak + mov v25.16b,v23.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs x2,x2,#8 + b.gt .Lxts_8_blocks_process + b 100f +.Lxts_4_blocks_process: + cmp x2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub x2,x2,#4 + mov v16.16b,v20.16b + mov v17.16b,v21.16b + mov v18.16b,v22.16b + // save the last tweak + mov v25.16b,v19.16b +1: + // process last block + cmp x2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v16.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and 
v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v16.16b + st1 {v4.4s},[x1],#16 + // save the last tweak + mov v25.16b,v16.16b + b 100f +1: // process last 2 blocks + cmp x2,#2 + b.gt 1f + ld1 {v4.4s,v5.4s},[x0],#32 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save the last tweak + mov v25.16b,v17.16b + b 100f +1: // process last 3 blocks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef 
__AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save the last tweak + mov v25.16b,v18.16b +100: + cmp x29,0 + b.eq .return + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak: +#ifdef __AARCH64EB__ + rev32 v25.16b,v25.16b +#endif + mov v2.16b,v25.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v17.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + mov v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + b .check_dec + + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak: + mov v17.16b,v16.16b +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + mov v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + b .check_dec + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. 
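When the length is not a multiple of 16, the .process_last_2blks/.loop code that follows implements ciphertext stealing: the last full block is encrypted under the first of the two tweaks computed above, its leading bytes are swapped with the trailing partial plaintext, and the spliced block is re-encrypted under the second tweak and written back over the full-block position (the .check_dec branch has already swapped the two tweaks for decryption). A C sketch of the encrypt-side stealing step, with sm4_encrypt_block as a hypothetical stand-in for the unrolled rounds above:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block primitive standing in for the unrolled rounds above. */
    void sm4_encrypt_block(uint8_t out[16], const uint8_t in[16], const uint32_t rk[32]);

    static void xor16(uint8_t *d, const uint8_t *a, const uint8_t *b)
    {
        for (int i = 0; i < 16; i++)
            d[i] = a[i] ^ b[i];
    }

    /* Encrypt-side ciphertext stealing: 'in' points at the last full block and
     * is followed by 'tail' (1..15) extra bytes; t_a is the tweak for that
     * block, t_b the tweak for the spliced block (already swapped by the
     * caller when decrypting, as .check_dec does). Writes 16+tail bytes.     */
    void xts_steal_encrypt(uint8_t *out, const uint8_t *in, size_t tail,
                           const uint8_t t_a[16], const uint8_t t_b[16],
                           const uint32_t rk[32])
    {
        uint8_t cc[16], buf[16];

        xor16(cc, in, t_a);
        sm4_encrypt_block(cc, cc, rk);
        xor16(cc, cc, t_a);              /* ciphertext of the last full block */

        memcpy(buf, cc, 16);
        memcpy(buf, in + 16, tail);      /* splice the trailing plaintext in ... */
        memcpy(out + 16, cc, tail);      /* ... and steal the leading ciphertext bytes */

        xor16(buf, buf, t_b);
        sm4_encrypt_block(buf, buf, rk);
        xor16(out, buf, t_b);            /* re-encrypted block lands in the full-block slot */
    }

The assembly's byte loop performs the two splices in place, reading both bytes before either store, so the routine also behaves when input and output alias.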
+.check_dec: + // encryption:1 decryption:0 + cmp w28,1 + b.eq .process_last_2blks + mov v0.16B,v17.16b + mov v17.16B,v18.16b + mov v18.16B,v0.16b + +.process_last_2blks: +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v17.16b + st1 {v4.4s},[x1],#16 + + sub x26,x1,16 +.loop: + subs x29,x29,1 + ldrb w7,[x26,x29] + ldrb w8,[x0,x29] + strb w8,[x26,x29] + strb w7,[x1,x29] + b.gt .loop + ld1 {v4.4s}, [x26] + eor v4.16b, v4.16b, 
v18.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v18.16b + st1 {v4.4s}, [x26] +.return: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt |
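Taken together, the register usage of the two XTS entry points suggests the arguments are (input x0, output x1, byte length x2, data-key schedule x3, tweak-key schedule x4, 16-byte IV x5, encrypt flag w6): the IV is first run through the tweak-key schedule to form tweak 0, full blocks are processed eight or four at a time with one doubling per block, and any 1-15 trailing bytes divert into the stealing path. A compact C reference of that flow for the IEEE-style variant, reusing the helpers sketched earlier and treating the prototypes as assumptions rather than this file's actual ABI:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Assumed primitives: the block-cipher stand-in plus the two helpers
     * sketched earlier next to the tweak-derivation and stealing code.   */
    void sm4_encrypt_block(uint8_t out[16], const uint8_t in[16], const uint32_t rk[32]);
    void xts_double(uint8_t t[16]);
    void xts_steal_encrypt(uint8_t *out, const uint8_t *in, size_t tail,
                           const uint8_t t_a[16], const uint8_t t_b[16],
                           const uint32_t rk[32]);

    /* IEEE-style SM4-XTS encryption, reference model only; requires len >= 16,
     * matching the assembly's early "b.lt .return". rk1 is the data schedule
     * (x3), rk2 the tweak schedule (x4), iv the 16-byte tweak seed (x5).     */
    void sm4_xts_encrypt_ref(const uint8_t *in, uint8_t *out, size_t len,
                             const uint32_t rk1[32], const uint32_t rk2[32],
                             const uint8_t iv[16])
    {
        uint8_t t[16], buf[16];
        size_t tail = len % 16;
        size_t full = len / 16 - (tail ? 1 : 0);  /* last full block joins the CTS path */

        sm4_encrypt_block(t, iv, rk2);            /* tweak 0 = E_rk2(IV) */

        for (size_t i = 0; i < full; i++) {
            for (int j = 0; j < 16; j++) buf[j] = in[j] ^ t[j];
            sm4_encrypt_block(buf, buf, rk1);
            for (int j = 0; j < 16; j++) out[j] = buf[j] ^ t[j];
            xts_double(t);
            in += 16;
            out += 16;
        }
        if (tail) {
            uint8_t t2[16];
            memcpy(t2, t, 16);
            xts_double(t2);
            xts_steal_encrypt(out, in, tail, t, t2, rk1);
        }
    }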