Diffstat (limited to 'sys/crypto/openssl/aarch64')
22 files changed, 28229 insertions, 89 deletions
diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S
new file mode 100644
index 000000000000..61e9326175d0
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S
@@ -0,0 +1,8488 @@
+/* Do not modify. This file is auto-generated from aes-gcm-armv8-unroll8_64.pl. */
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+.arch armv8-a+crypto
+.text
+.globl unroll8_eor3_aes_gcm_enc_128_kernel
+.type unroll8_eor3_aes_gcm_enc_128_kernel,%function
+.align 4
+unroll8_eor3_aes_gcm_enc_128_kernel:
+ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L128_enc_ret
+ stp d8, d9, [sp, #-80]!
+ lsr x9, x1, #3
+ mov x16, x4
+ mov x8, x5
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ mov x5, #0xc200000000000000
+ stp x5, xzr, [sp, #64]
+ add x10, sp, #64
+
+ mov x15, #0x100000000 //set up counter increment
+ movi v31.16b, #0x0
+ mov v31.d[1], x15
+ mov x5, x9
+ ld1 { v0.16b}, [x16] //CTR block 0
+
+ sub x5, x5, #1 //byte_len - 1
+
+ and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+ rev32 v30.16b, v0.16b //set up reversed counter
+
+ add v30.4s, v30.4s, v31.4s //CTR block 0
+
+ rev32 v1.16b, v30.16b //CTR block 1
+ add v30.4s, v30.4s, v31.4s //CTR block 1
+
+ rev32 v2.16b, v30.16b //CTR block 2
+ add v30.4s, v30.4s, v31.4s //CTR block 2
+
+ rev32 v3.16b, v30.16b //CTR block 3
+ add v30.4s, v30.4s, v31.4s //CTR block 3
+
+ rev32 v4.16b, v30.16b //CTR block 4
+ add v30.4s, v30.4s, v31.4s //CTR block 4
+
+ rev32 v5.16b, v30.16b //CTR block 5
+ add v30.4s, v30.4s, v31.4s //CTR block 5
+ ldp q26, q27, [x8, #0] //load rk0, rk1
+
+ rev32 v6.16b, v30.16b //CTR block 6
+ add v30.4s, v30.4s, v31.4s //CTR block 6
+
+ rev32 v7.16b, v30.16b //CTR block 7
+ add v30.4s, v30.4s, v31.4s //CTR block 7
+
+ aese v4.16b, v26.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 0
+ aese v6.16b, v26.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 0
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+
+ aese v7.16b, v26.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 0
+ aese v5.16b, v26.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 0
+ ldp q28, q26, [x8, #32] //load rk2, rk3
+
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+
+ aese v7.16b, v27.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 1
+ aese v5.16b, v27.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 1
+ aese v4.16b, v27.16b
+ aesmc v4.16b, v4.16b //AES block 4 - round 1
+
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+ aese v6.16b, v27.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 1
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+
+ aese v5.16b, v28.16b
+ aesmc v5.16b, v5.16b //AES block 5 - round 2
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+ aese v7.16b, v28.16b
+ aesmc v7.16b, v7.16b //AES block 7 - round 2
+
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+ aese v6.16b, v28.16b
+ aesmc v6.16b, v6.16b //AES block 6 - round 2
+ aese
v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + ldr q27, [x8, #160] //load rk10 + + aese v3.16b, v26.16b //AES block 
8k+11 - round 9 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + add x5, x5, x0 + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + b.ge .L128_enc_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext + cmp x0, x5 //check if we have <= 8 blocks + +.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + +.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result + stp q8, q9, [x2], #32 //AES block 0, 1 - store result + + rev32 v1.16b, v30.16b //CTR block 9 +.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result + add v30.4s, v30.4s, v31.4s //CTR block 9 + +.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result +.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result +.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + +.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result +.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b,v27.16b //AES block 7 - result + stp q10, q11, [x2], #32 //AES block 2, 3 - store result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + + rev32 v4.16b, v30.16b //CTR block 12 + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L128_enc_prepretail //do prepretail + +.L128_enc_main_loop: //main loop start + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + rev64 v8.16b, v8.16b //GHASH block 8k + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + ldp q26, q27, [x8, #0] //load rk0, rk1 + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v7.16b, v30.16b //CTR block 8k+15 + + rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + rev64 v10.16b, v10.16b //GHASH block 8k+2 + pmull2 v17.1q, v8.2d, v25.2d 
//GHASH block 8k - high + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h3l | h3h + ext v25.16b, v25.16b, v25.16b, #8 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b,v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 
8k+14 - round 3 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + ldp q26, q27, [x8, #96] //load rk6, rk7 + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + ldr d16, [x10] //MODULO - load modulo constant + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + + pmull2 v13.1q, 
v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + rev32 v20.16b, v30.16b //CTR block 8k+16 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + ldp q28, q26, [x8, #128] //load rk8, rk9 +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + rev32 v22.16b, v30.16b //CTR block 8k+17 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldr q27, [x8, #160] //load rk10 + + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + cmp x0, x5 //.LOOP CONTROL +.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + +.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result + + mov 
v2.16b, v23.16b //CTR block 8k+18 + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + +.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + +.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result +.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result + mov v3.16b, v25.16b //CTR block 8k+19 + + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment +.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result + mov v1.16b, v22.16b //CTR block 8k+17 + +.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result + mov v0.16b, v20.16b //CTR block 8k+16 + stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result + + stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result +.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result + + stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + + stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result + b.lt .L128_enc_main_loop + +.L128_enc_prepretail: //PREPRETAIL + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v8.16b, v8.16b //GHASH block 8k + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h6k | h5k + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + eor v8.16b, v8.16b, v19.16b //PRE 1 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + rev32 v7.16b, v30.16b //CTR block 8k+15 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v3.16b, v26.16b 
+ aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - 
round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + ldp q26, q27, [x8, #96] //load rk6, rk7 + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + ldr d16, [x10] //MODULO - load modulo constant + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldp q28, q26, [x8, #128] //load rk8, rk9 + 
+ aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 +.inst 0xce114a73 //eor3 v19.16b, v19.16b, v17.16b, v18.16b //MODULO - fold into low + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + + ldr q27, [x8, #160] //load rk10 + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + aese v7.16b, v26.16b //AES block 8k+15 - round 9 +.L128_enc_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldr q8, [x0], #16 //AES block 8k+8 - load plaintext + + mov v29.16b, v27.16b + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + +.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + cmp x5, #112 + b.gt .L128_enc_blocks_more_than_7 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + movi v17.8b, #0 + + cmp x5, #96 + sub v30.4s, v30.4s, v31.4s + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v2.16b + mov v2.16b, v1.16b + + movi v19.8b, #0 + movi v18.8b, #0 + b.gt .L128_enc_blocks_more_than_6 + + mov v7.16b, v6.16b + cmp x5, #80 + + sub v30.4s, v30.4s, v31.4s + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + b.gt .L128_enc_blocks_more_than_5 + + cmp x5, #64 + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v1.16b + b.gt .L128_enc_blocks_more_than_4 + + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + cmp x5, #48 + b.gt .L128_enc_blocks_more_than_3 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + mov v6.16b, v1.16b + + cmp x5, #32 + ldr q24, [x3, #96] //load 
h4k | h3k + b.gt .L128_enc_blocks_more_than_2 + + cmp x5, #16 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v1.16b + b.gt .L128_enc_blocks_more_than_1 + + ldr q21, [x3, #48] //load h2k | h1k + sub v30.4s, v30.4s, v31.4s + b .L128_enc_blocks_less_than_1 +.L128_enc_blocks_more_than_7: //blocks left > 7 + st1 { v9.16b}, [x2], #16 //AES final-7 block - store result + + rev64 v8.16b, v9.16b //GHASH final-7 block + ldr q9, [x0], #16 //AES final-6 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + +.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low +.L128_enc_blocks_more_than_6: //blocks left > 6 + + st1 { v9.16b}, [x2], #16 //AES final-6 block - store result + + rev64 v8.16b, v9.16b //GHASH final-6 block + ldr q9, [x0], #16 //AES final-5 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + +.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high +.L128_enc_blocks_more_than_5: //blocks left > 5 + + st1 { v9.16b}, [x2], #16 //AES final-5 block - store result + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + ldr q9, [x0], #16 //AES final-4 block - load plaintext + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + +.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + movi v16.8b, #0 //suppress further partial tag feed in + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.L128_enc_blocks_more_than_4: //blocks left > 4 + + st1 { v9.16b}, [x2], #16 //AES final-4 block - store result + + rev64 v8.16b, v9.16b //GHASH final-4 block + + ldr q9, [x0], #16 //AES final-3 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v19.16b, 
v19.16b, v26.16b //GHASH final-4 block - low + +.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.L128_enc_blocks_more_than_3: //blocks left > 3 + + st1 { v9.16b}, [x2], #16 //AES final-3 block - store result + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + ldr q24, [x3, #96] //load h4k | h3k + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + + ldr q9, [x0], #16 //AES final-2 block - load plaintext + + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + +.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high +.L128_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v9.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v8.16b, v9.16b //GHASH final-2 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-1 block - load plaintext + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + movi v16.8b, #0 //suppress further partial tag feed in + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid +.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low +.L128_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v9.16b}, [x2], #16 //AES final-1 block - store result + + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-1 block + ldr q9, [x0], #16 //AES final block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + movi v16.8b, #0 //suppress further partial tag feed in + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid +.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ldr q21, [x3, #48] //load h2k | h1k + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low +.L128_enc_blocks_less_than_1: //blocks left <= 1 + + rev32 v30.16b, v30.16b + str q30, [x16] //store the updated counter + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 
+ + mvn x6, xzr //temp0_x = 0xffffffffffffffff + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + mvn x7, xzr //temp1_x = 0xffffffffffffffff + cmp x1, #64 + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + + mov v0.d[1], x14 + mov v0.d[0], x13 //ctr0b is mask for last block + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + + bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing + st1 { v9.16b}, [x2] //store all 16B + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L128_enc_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel +.globl unroll8_eor3_aes_gcm_dec_128_kernel +.type unroll8_eor3_aes_gcm_dec_128_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_dec_128_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L128_dec_ret + stp d8, d9, [sp, #-80]! 
+ lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + mov x5, x9 + ld1 { v0.16b}, [x16] //CTR block 0 + + ldp q26, q27, [x8, #0] //load rk0, rk1 + sub x5, x5, #1 //byte_len - 1 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + + rev32 v30.16b, v0.16b //set up reversed counter + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + 
aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + add x5, x5, x0 + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + + aese v0.16b, v26.16b //AES block 0 - round 9 + aese v1.16b, v26.16b //AES block 1 - round 9 + aese v6.16b, v26.16b //AES block 6 - round 9 + + ldr q27, [x8, #160] //load rk10 + aese v4.16b, v26.16b //AES block 4 - round 9 + aese v3.16b, v26.16b //AES block 3 - round 9 + + aese v2.16b, v26.16b //AES block 2 - round 9 + aese v5.16b, v26.16b //AES block 5 - round 9 + aese v7.16b, v26.16b //AES block 7 - round 9 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + b.ge 
.L128_dec_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext + +.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result +.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result + stp q0, q1, [x2], #32 //AES block 0, 1 - store result + + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 + ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext + +.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result +.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result + stp q2, q3, [x2], #32 //AES block 2, 3 - store result + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + +.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + +.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result +.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result + stp q4, q5, [x2], #32 //AES block 4, 5 - store result + +.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result + stp q6, q7, [x2], #32 //AES block 6, 7 - store result + rev32 v4.16b, v30.16b //CTR block 12 + + cmp x0, x5 //check if we have <= 8 blocks + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L128_dec_prepretail //do prepretail + +.L128_dec_main_loop: //main loop start + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + rev64 v8.16b, v8.16b //GHASH block 8k + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + rev64 v12.16b, v12.16b //GHASH block 8k+4 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 
8k+11 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + ldp q28, q26, [x8, #32] //load rk2, rk3 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + + ldp q27, q28, [x8, #64] //load rk4, rk5 + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES 
block 8k+9 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + ldr d16, [x10] //MODULO - load modulo constant +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES 
block 8k+13 - round 7 + + rev32 v20.16b, v30.16b //CTR block 8k+16 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + rev32 v22.16b, v30.16b //CTR block 8k+17 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext + + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + rev32 v23.16b, v30.16b //CTR block 8k+18 + + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + ldr q27, [x8, #160] //load rk10 + + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 +.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result + +.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result +.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result +.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result + +.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result + stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result + mov v1.16b, v22.16b //CTR block 8k+17 + +.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + mov v0.16b, v20.16b //CTR block 8k+16 + +.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, 
v27.16b //AES block 8k+11 - result + cmp x0, x5 //.LOOP CONTROL + stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result + +.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result + mov v2.16b, v23.16b //CTR block 8k+18 + + stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + + stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result + mov v3.16b, v25.16b //CTR block 8k+19 + b.lt .L128_dec_main_loop + +.L128_dec_prepretail: //PREPRETAIL + rev64 v11.16b, v11.16b //GHASH block 8k+3 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v8.16b, v8.16b //GHASH block 8k + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + rev32 v7.16b, v30.16b //CTR block 8k+15 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + 
aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + ldp q28, q26, [x8, #32] //load rk2, rk3 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + ldp q27, q28, [x8, #64] //load rk4, rk5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v1.16b, v27.16b + 
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + ldp q26, q27, [x8, #96] //load rk6, rk7 +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + ldr d16, [x10] //MODULO - load modulo constant + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldp q28, q26, [x8, #128] //load rk8, rk9 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + ldr q27, [x8, #160] //load rk10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 
64b align with low + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + +.L128_dec_tail: //TAIL + + mov v29.16b, v27.16b + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + cmp x5, #112 + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + +.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result + b.gt .L128_dec_blocks_more_than_7 + + cmp x5, #96 + mov v7.16b, v6.16b + movi v19.8b, #0 + + movi v17.8b, #0 + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v2.16b + mov v2.16b, v1.16b + + movi v18.8b, #0 + sub v30.4s, v30.4s, v31.4s + b.gt .L128_dec_blocks_more_than_6 + + cmp x5, #80 + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + b.gt .L128_dec_blocks_more_than_5 + + cmp x5, #64 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L128_dec_blocks_more_than_4 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + cmp x5, #48 + b.gt .L128_dec_blocks_more_than_3 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + cmp x5, #32 + + ldr q24, [x3, #96] //load h4k | h3k + mov v6.16b, v1.16b + b.gt .L128_dec_blocks_more_than_2 + + cmp x5, #16 + + mov v7.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L128_dec_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L128_dec_blocks_less_than_1 +.L128_dec_blocks_more_than_7: //blocks left > 7 + rev64 v8.16b, v9.16b //GHASH final-7 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + ldr q9, [x0], #16 //AES final-6 block - load ciphertext + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + st1 { v12.16b}, [x2], #16 //AES final-7 block - store result +.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b 
//AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid +.L128_dec_blocks_more_than_6: //blocks left > 6 + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + ldr q9, [x0], #16 //AES final-5 block - load ciphertext + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + st1 { v12.16b}, [x2], #16 //AES final-6 block - store result + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid +.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result +.L128_dec_blocks_more_than_5: //blocks left > 5 + + rev64 v8.16b, v9.16b //GHASH final-5 block + + ldr q9, [x0], #16 //AES final-4 block - load ciphertext + st1 { v12.16b}, [x2], #16 //AES final-5 block - store result + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + +.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + movi v16.8b, #0 //suppress further partial tag feed in + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high +.L128_dec_blocks_more_than_4: //blocks left > 4 + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q9, [x0], #16 //AES final-3 block - load ciphertext + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + + st1 { v12.16b}, [x2], #16 //AES final-4 block - store result + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + +.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.L128_dec_blocks_more_than_3: //blocks left > 3 + + st1 { v12.16b}, [x2], #16 //AES final-3 block - store result + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + ldr q24, [x3, #96] //load h4k | h3k + + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ldr q9, [x0], #16 //AES final-2 block - load ciphertext + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + movi v16.8b, #0 //suppress further partial tag 
feed in +.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid +.L128_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v8.16b, v9.16b //GHASH final-2 block + + st1 { v12.16b}, [x2], #16 //AES final-2 block - store result + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + ldr q9, [x0], #16 //AES final-1 block - load ciphertext + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + +.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high +.L128_dec_blocks_more_than_1: //blocks left > 1 + + st1 { v12.16b}, [x2], #16 //AES final-1 block - store result + rev64 v8.16b, v9.16b //GHASH final-1 block + + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + + ldr q9, [x0], #16 //AES final block - load ciphertext + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + ldr q21, [x3, #48] //load h2k | h1k + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid +.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid +.L128_dec_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + + mov v0.d[1], x14 + mov v0.d[0], x13 //ctr0b is mask for last block + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + ins v16.d[0], v8.d[1] //GHASH final block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + bif v12.16b, v26.16b, v0.16b //insert existing bytes in top 
end of result before storing + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + st1 { v12.16b}, [x2] //store all 16B + + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up + +.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + rev32 v30.16b, v30.16b + + str q30, [x16] //store the updated counter + + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret +.L128_dec_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel +.globl unroll8_eor3_aes_gcm_enc_192_kernel +.type unroll8_eor3_aes_gcm_enc_192_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_enc_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L192_enc_ret + stp d8, d9, [sp, #-80]! + lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + mov x5, x9 + ld1 { v0.16b}, [x16] //CTR block 0 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + sub x5, x5, #1 //byte_len - 1 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + add x5, x5, x0 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - 
round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, 
v4.16b //AES block 4 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + ldp q27, q28, [x8, #160] //load rk10, rk11 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 14 - round 10 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 11 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 9 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 13 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 12 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 10 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 15 - round 10 + + aese v6.16b, v28.16b //AES block 14 - round 11 + aese v3.16b, v28.16b //AES block 11 - round 11 + + aese v4.16b, v28.16b //AES block 12 - round 11 + aese v7.16b, v28.16b //AES block 15 - round 11 + ldr q26, [x8, #192] //load rk12 + + aese v1.16b, v28.16b //AES block 9 - round 11 + aese v5.16b, v28.16b //AES block 13 - round 11 + + aese v2.16b, v28.16b //AES block 10 - round 11 + aese v0.16b, v28.16b //AES block 8 - round 11 + b.ge .L192_enc_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext + +.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + +.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result +.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 +.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result + +.inst 0xce0569ad //eor3 
v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result +.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result + stp q8, q9, [x2], #32 //AES block 0, 1 - store result + +.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + + stp q10, q11, [x2], #32 //AES block 2, 3 - store result + cmp x0, x5 //check if we have <= 8 blocks + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 +.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result + + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + + rev32 v4.16b, v30.16b //CTR block 12 + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + add v30.4s, v30.4s, v31.4s //CTR block 12 + + b.ge .L192_enc_prepretail //do prepretail + +.L192_enc_main_loop: //main loop start + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + ldp q26, q27, [x8, #0] //load rk0, rk1 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v8.16b, v8.16b //GHASH block 8k + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + rev32 v7.16b, v30.16b //CTR block 8k+15 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES 
block 8k+14 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 
(t0, t1, t2 and t3 free) + + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + ldr d16, [x10] //MODULO - load modulo constant +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese 
v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + ldp q27, q28, [x8, #160] //load rk10, rk11 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + rev32 v20.16b, v30.16b //CTR block 8k+16 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + rev32 v22.16b, v30.16b //CTR block 8k+17 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + ldr q26, [x8, #192] //load rk12 + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext + + aese v4.16b, v28.16b //AES block 8k+12 - round 11 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + + rev32 v23.16b, v30.16b //CTR block 8k+18 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + aese v0.16b, v28.16b //AES block 8k+8 - round 11 +.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result + + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 +.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result + +.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result +.inst 0xce006908 //eor3 v8.16b, v8.16b, 
v0.16b, v26.16b //AES block 8k+8 - result + mov v2.16b, v23.16b //CTR block 8k+18 + +.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result + mov v1.16b, v22.16b //CTR block 8k+17 + stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result + mov v0.16b, v20.16b //CTR block 8k+16 + rev32 v4.16b, v30.16b //CTR block 8k+20 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 +.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + +.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result + mov v3.16b, v25.16b //CTR block 8k+19 + + stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result + + stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result + + cmp x0, x5 //.LOOP CONTROL + stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result + b.lt .L192_enc_main_loop + +.L192_enc_prepretail: //PREPRETAIL + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v8.16b //GHASH block 8k + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v7.16b, v30.16b //CTR block 8k+15 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 
8k+3 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v2.16b, v27.16b + 
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + ldr d16, [x10] //MODULO - load modulo constant + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + +.inst 0xce114e52 //eor3 
v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + ldp q27, q28, [x8, #160] //load rk10, rk11 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ldr q26, [x8, #192] //load rk12 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + aese v0.16b, v28.16b //AES block 8k+8 - round 11 + + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v4.16b, v28.16b //AES block 8k+12 - round 11 + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + +.L192_enc_tail: //TAIL + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + ldr q8, [x0], #16 //AES block 8k+8 - l3ad plaintext + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + + mov v29.16b, v26.16b + + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + cmp x5, #112 + +.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - 
result + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + b.gt .L192_enc_blocks_more_than_7 + + cmp x5, #96 + mov v7.16b, v6.16b + movi v17.8b, #0 + + mov v6.16b, v5.16b + movi v19.8b, #0 + sub v30.4s, v30.4s, v31.4s + + mov v5.16b, v4.16b + mov v4.16b, v3.16b + mov v3.16b, v2.16b + + mov v2.16b, v1.16b + movi v18.8b, #0 + b.gt .L192_enc_blocks_more_than_6 + + mov v7.16b, v6.16b + cmp x5, #80 + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + mov v4.16b, v3.16b + + mov v3.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L192_enc_blocks_more_than_5 + + cmp x5, #64 + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v1.16b + b.gt .L192_enc_blocks_more_than_4 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + cmp x5, #48 + b.gt .L192_enc_blocks_more_than_3 + + mov v7.16b, v6.16b + mov v6.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + + ldr q24, [x3, #96] //load h4k | h3k + cmp x5, #32 + b.gt .L192_enc_blocks_more_than_2 + + sub v30.4s, v30.4s, v31.4s + + cmp x5, #16 + mov v7.16b, v1.16b + b.gt .L192_enc_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L192_enc_blocks_less_than_1 +.L192_enc_blocks_more_than_7: //blocks left > 7 + st1 { v9.16b}, [x2], #16 //AES final-7 block - store result + + rev64 v8.16b, v9.16b //GHASH final-7 block + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + + ldr q9, [x0], #16 //AES final-6 block - load plaintext + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid +.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result +.L192_enc_blocks_more_than_6: //blocks left > 6 + + st1 { v9.16b}, [x2], #16 //AES final-6 block - store result + + rev64 v8.16b, v9.16b //GHASH final-6 block + + ldr q9, [x0], #16 //AES final-5 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low +.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid +.L192_enc_blocks_more_than_5: //blocks left > 5 + + st1 { v9.16b}, [x2], #16 //AES final-5 block - store result + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + ldr q9, [x0], #16 //AES final-4 block - load plaintext + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + + eor v19.16b, v19.16b, v26.16b 
//GHASH final-5 block - low + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + +.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.L192_enc_blocks_more_than_4: //blocks left > 4 + + st1 { v9.16b}, [x2], #16 //AES final-4 block - store result + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-3 block - load plaintext + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result +.L192_enc_blocks_more_than_3: //blocks left > 3 + + ldr q24, [x3, #96] //load h4k | h3k + st1 { v9.16b}, [x2], #16 //AES final-3 block - store result + + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ldr q9, [x0], #16 //AES final-2 block - load plaintext + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + +.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high +.L192_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v9.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v8.16b, v9.16b //GHASH final-2 block + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-1 block - load plaintext + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid +.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result +.L192_enc_blocks_more_than_1: //blocks left > 1 + + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + st1 { v9.16b}, [x2], #16 //AES final-1 block - store result + + rev64 v8.16b, v9.16b //GHASH final-1 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-1 block - 
mid + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ldr q9, [x0], #16 //AES final block - load plaintext + ldr q21, [x3, #48] //load h2k | h1k + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + +.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high +.L192_enc_blocks_less_than_1: //blocks left <= 1 + + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + + mov v0.d[1], x14 + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + mov v0.d[0], x13 //ctr0b is mask for last block + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing + + st1 { v9.16b}, [x2] //store all 16B + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + rev32 v30.16b, v30.16b + + str q30, [x16] //store the updated counter +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + + mov x0, x9 //return sizes + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L192_enc_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel +.globl unroll8_eor3_aes_gcm_dec_192_kernel +.type unroll8_eor3_aes_gcm_dec_192_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_dec_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L192_dec_ret + stp d8, d9, [sp, #-80]! 
+ lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + mov x5, x9 + ld1 { v0.16b}, [x16] //CTR block 0 + ld1 { v19.16b}, [x3] + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v5.16b, 
v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + + sub x5, x5, #1 //byte_len - 1 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + add x4, x0, x1, lsr #3 //end_input_ptr + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + + ldp q27, q28, [x8, #160] //load rk10, rk11 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + add x5, x5, x0 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + + cmp x0, x5 //check if we have <= 8 blocks + 
aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 10 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 10 + ldr q26, [x8, #192] //load rk12 + + aese v0.16b, v28.16b //AES block 0 - round 11 + aese v1.16b, v28.16b //AES block 1 - round 11 + aese v4.16b, v28.16b //AES block 4 - round 11 + + aese v6.16b, v28.16b //AES block 6 - round 11 + aese v5.16b, v28.16b //AES block 5 - round 11 + aese v7.16b, v28.16b //AES block 7 - round 11 + + aese v2.16b, v28.16b //AES block 2 - round 11 + aese v3.16b, v28.16b //AES block 3 - round 11 + b.ge .L192_dec_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext + +.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result +.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result + stp q0, q1, [x2], #32 //AES block 0, 1 - store result + + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 +.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result + +.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result + stp q2, q3, [x2], #32 //AES block 2, 3 - store result + ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + +.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + +.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result + stp q4, q5, [x2], #32 //AES block 4, 5 - store result + cmp x0, x5 //check if we have <= 8 blocks + +.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result +.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result + rev32 v4.16b, v30.16b //CTR block 12 + + add v30.4s, v30.4s, v31.4s //CTR block 12 + stp q6, q7, [x2], #32 //AES block 6, 7 - store result + b.ge .L192_dec_prepretail //do prepretail + +.L192_dec_main_loop: //main loop start + rev64 v9.16b, v9.16b //GHASH block 8k+1 + ldp q26, q27, [x8, #0] //load rk0, rk1 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev64 v8.16b, v8.16b //GHASH block 8k + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v12.16b, v12.16b //GHASH block 8k+4 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + 
rev64 v13.16b, v13.16b //GHASH block 8k+5 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + rev64 v10.16b, v10.16b //GHASH block 8k+2 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v4.16b, v26.16b + aesmc 
v4.16b, v4.16b //AES block 8k+12 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 
8k+10 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + ldr d16, [x10] //MODULO - load modulo constant + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + rev32 v20.16b, v30.16b //CTR block 8k+16 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + ldp q27, q28, [x8, #160] //load rk10, rk11 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext + + rev32 v22.16b, v30.16b //CTR block 8k+17 + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 
8k+12 - round 9 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext + + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + ldr q26, [x8, #192] //load rk12 + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + + aese v0.16b, v28.16b //AES block 8k+8 - round 11 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + +.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result + rev32 v25.16b, v30.16b //CTR block 8k+19 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v4.16b, v28.16b //AES block 8k+12 - round 11 + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + +.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result + stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result +.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result + +.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result +.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result + stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result + +.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + mov v3.16b, v25.16b //CTR block 8k+19 + +.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result + stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result + cmp x0, x5 //.LOOP CONTROL + +.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result + stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result + mov v0.16b, v20.16b //CTR block 8k+16 + + mov v1.16b, v22.16b //CTR block 8k+17 + mov v2.16b, v23.16b //CTR block 8k+18 + + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + b.lt .L192_dec_main_loop + +.L192_dec_prepretail: //PREPRETAIL + ldp q26, q27, [x8, #0] //load rk0, rk1 + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v8.16b //GHASH block 8k + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 
+ + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev32 v7.16b, v30.16b //CTR block 8k+15 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + rev64 v13.16b, v13.16b //GHASH block 8k+5 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + 
+ aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + rev64 v15.16b, v15.16b //GHASH block 8k+7 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + rev64 v12.16b, v12.16b //GHASH block 8k+4 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v26.16b 
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + + ldr d16, [x10] //MODULO - load modulo constant +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + ldp q27, q28, [x8, #160] //load rk10, rk11 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ldr 
q26, [x8, #192] //load rk12 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + + aese v0.16b, v28.16b //AES block 8k+8 - round 11 +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v4.16b, v28.16b //AES block 8k+12 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + +.L192_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + + mov v29.16b, v26.16b + + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + +.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result + cmp x5, #112 + b.gt .L192_dec_blocks_more_than_7 + + mov v7.16b, v6.16b + movi v17.8b, #0 + sub v30.4s, v30.4s, v31.4s + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + mov v4.16b, v3.16b + + cmp x5, #96 + movi v19.8b, #0 + mov v3.16b, v2.16b + + mov v2.16b, v1.16b + movi v18.8b, #0 + b.gt .L192_dec_blocks_more_than_6 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + cmp x5, #80 + b.gt .L192_dec_blocks_more_than_5 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v1.16b + cmp x5, #64 + + sub v30.4s, v30.4s, v31.4s + b.gt .L192_dec_blocks_more_than_4 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + cmp x5, #48 + b.gt .L192_dec_blocks_more_than_3 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + cmp x5, #32 + + mov v6.16b, v1.16b + ldr q24, [x3, #96] //load h4k | h3k + b.gt .L192_dec_blocks_more_than_2 + + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v1.16b + cmp x5, #16 + b.gt .L192_dec_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L192_dec_blocks_less_than_1 +.L192_dec_blocks_more_than_7: //blocks left > 7 + rev64 v8.16b, v9.16b //GHASH final-7 block + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + ldr q9, [x0], #16 //AES final-6 block - load ciphertext + + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + st1 { 
v12.16b}, [x2], #16 //AES final-7 block - store result + +.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in +.L192_dec_blocks_more_than_6: //blocks left > 6 + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-5 block - load ciphertext + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + st1 { v12.16b}, [x2], #16 //AES final-6 block - store result +.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low +.L192_dec_blocks_more_than_5: //blocks left > 5 + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + ldr q9, [x0], #16 //AES final-4 block - load ciphertext + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + movi v16.8b, #0 //suppress further partial tag feed in + st1 { v12.16b}, [x2], #16 //AES final-5 block - store result + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result +.L192_dec_blocks_more_than_4: //blocks left > 4 + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ldr q9, [x0], #16 //AES final-3 block - load ciphertext + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + st1 { v12.16b}, [x2], #16 //AES final-4 block - store result + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + +.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high +.L192_dec_blocks_more_than_3: //blocks left > 3 + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-3 block + ldr q9, [x0], #16 //AES final-2 block - load ciphertext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + movi v16.8b, #0 //suppress further partial tag feed in + pmull v26.1q, v8.1d, v25.1d //GHASH 
final-3 block - low + + st1 { v12.16b}, [x2], #16 //AES final-3 block - store result + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid +.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + ldr q24, [x3, #96] //load h4k | h3k + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid +.L192_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v8.16b, v9.16b //GHASH final-2 block + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + ldr q9, [x0], #16 //AES final-1 block - load ciphertext + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + st1 { v12.16b}, [x2], #16 //AES final-2 block - store result + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid +.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result +.L192_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v8.16b, v9.16b //GHASH final-1 block + ldr q9, [x0], #16 //AES final block - load ciphertext + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + ldr q21, [x3, #48] //load h2k | h1k + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + st1 { v12.16b}, [x2], #16 //AES final-1 block - store result + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + +.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high +.L192_dec_blocks_less_than_1: //blocks left <= 1 + + rev32 v30.16b, v30.16b + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + str q30, [x16] //store the updated counter + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + mvn x6, xzr //temp0_x = 0xffffffffffffffff + + and x1, x1, #127 //bit_length %= 128 + + mvn x7, xzr //temp1_x = 0xffffffffffffffff + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + mov v0.d[1], x14 + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + mov v0.d[0], x13 //ctr0b is mask for last block + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing + + rev64 v8.16b, v9.16b //GHASH final block + + st1 { v12.16b}, 
[x2] //store all 16B + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + + eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up + +.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L192_dec_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel +.globl unroll8_eor3_aes_gcm_enc_256_kernel +.type unroll8_eor3_aes_gcm_enc_256_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_enc_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L256_enc_ret + stp d8, d9, [sp, #-80]! + lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + ld1 { v0.16b}, [x16] //CTR block 0 + + mov x5, x9 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + sub x5, x5, #1 //byte_len - 1 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + add x5, x5, x0 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b 
//AES block 4 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + 
aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + ldp q27, q28, [x8, #160] //load rk10, rk11 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 10 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 10 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 11 + ldp q26, q27, [x8, #192] //load rk12, rk13 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 11 + + add v30.4s, v30.4s, v31.4s //CTR block 7 + ldr q28, [x8, #224] //load rk14 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 12 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 12 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + + aese v2.16b, v27.16b //AES block 2 - round 13 + aese v1.16b, v27.16b //AES block 1 - round 
13 + aese v4.16b, v27.16b //AES block 4 - round 13 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 12 + + aese v0.16b, v27.16b //AES block 0 - round 13 + aese v5.16b, v27.16b //AES block 5 - round 13 + + aese v6.16b, v27.16b //AES block 6 - round 13 + aese v7.16b, v27.16b //AES block 7 - round 13 + aese v3.16b, v27.16b //AES block 3 - round 13 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + b.ge .L256_enc_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext + +.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + +.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result +.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext +.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result + cmp x0, x5 //check if we have <= 8 blocks + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + stp q8, q9, [x2], #32 //AES block 0, 1 - store result + + stp q10, q11, [x2], #32 //AES block 2, 3 - store result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + +.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result + +.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result +.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result +.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result + + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + rev32 v4.16b, v30.16b //CTR block 12 + + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L256_enc_prepretail //do prepretail + +.L256_enc_main_loop: //main loop start + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + rev64 v8.16b, v8.16b //GHASH block 8k + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + rev32 v7.16b, v30.16b //CTR block 8k+15 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - 
round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + eor v8.16b, v8.16b, v19.16b //PRE 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - 
round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + trn1 v13.2d, v15.2d, 
v14.2d //GHASH block 8k+6, 8k+7 - mid + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + ldp q27, q28, [x8, #160] //load rk10, rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + ldr d16, [x10] //MODULO - load modulo constant + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + ldp q26, q27, [x8, #192] //load rk12, rk13 + rev32 v20.16b, v30.16b //CTR block 8k+16 + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + + aese v0.16b, v28.16b + 
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + rev32 v22.16b, v30.16b //CTR block 8k+17 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + ldr q28, [x8, #224] //load rk14 + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + aese v5.16b, v27.16b //AES block 8k+13 - round 13 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + cmp x0, x5 //.LOOP CONTROL + +.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + aese v0.16b, v27.16b //AES block 8k+8 - round 13 + aese v6.16b, v27.16b //AES block 8k+14 - round 13 +.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result + + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + +.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result + rev32 v4.16b, v30.16b //CTR block 8k+20 +.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result + + mov v3.16b, v25.16b //CTR block 8k+19 +.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result +.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result + + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result + mov v2.16b, v23.16b //CTR block 8k+18 + +.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low + stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result + +.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result + mov v1.16b, v22.16b //CTR block 8k+17 + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + mov v0.16b, v20.16b //CTR block 8k+16 + b.lt 
.L256_enc_main_loop + +.L256_enc_prepretail: //PREPRETAIL + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + rev64 v13.16b, v13.16b //GHASH block 8k+5 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev32 v7.16b, v30.16b //CTR block 8k+15 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v8.16b, v8.16b //GHASH block 8k + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + eor v8.16b, v8.16b, v19.16b //PRE 1 + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 
- round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, 
v1.16b //AES block 8k+9 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + + ldp q27, q28, [x8, #160] //load rk10, rk11 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + ldr d16, [x10] //MODULO - load modulo constant + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, 
v2.16b //AES block 8k+10 - round 9 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + + ldp q26, q27, [x8, #192] //load rk12, rk13 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + ldr q28, [x8, #224] //load rk14 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + aese v0.16b, v27.16b //AES block 8k+8 - round 13 + +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low + aese v5.16b, v27.16b //AES block 8k+13 - round 13 + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + aese v6.16b, v27.16b //AES block 8k+14 - round 13 +.L256_enc_tail: //TAIL + + ldp q24, q25, [x3, #192] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + ldr q8, [x0], #16 //AES block 8k+8 - load plaintext + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, 
v23.16b, v23.16b, #8 + mov v29.16b, v28.16b + + cmp x5, #112 +.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result + b.gt .L256_enc_blocks_more_than_7 + + movi v19.8b, #0 + mov v7.16b, v6.16b + movi v17.8b, #0 + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + mov v4.16b, v3.16b + + mov v3.16b, v2.16b + sub v30.4s, v30.4s, v31.4s + mov v2.16b, v1.16b + + movi v18.8b, #0 + cmp x5, #96 + b.gt .L256_enc_blocks_more_than_6 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + cmp x5, #80 + + mov v5.16b, v4.16b + mov v4.16b, v3.16b + mov v3.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + b.gt .L256_enc_blocks_more_than_5 + + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + cmp x5, #64 + mov v4.16b, v1.16b + b.gt .L256_enc_blocks_more_than_4 + + cmp x5, #48 + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L256_enc_blocks_more_than_3 + + cmp x5, #32 + mov v7.16b, v6.16b + ldr q24, [x3, #96] //load h4k | h3k + + mov v6.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L256_enc_blocks_more_than_2 + + mov v7.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + cmp x5, #16 + b.gt .L256_enc_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L256_enc_blocks_less_than_1 +.L256_enc_blocks_more_than_7: //blocks left > 7 + st1 { v9.16b}, [x2], #16 //AES final-7 block - store result + + rev64 v8.16b, v9.16b //GHASH final-7 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-6 block - load plaintext + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid +.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low +.L256_enc_blocks_more_than_6: //blocks left > 6 + + st1 { v9.16b}, [x2], #16 //AES final-6 block - store result + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + ldr q9, [x0], #16 //AES final-5 block - load plaintext + + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid +.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high +.L256_enc_blocks_more_than_5: //blocks left > 5 + + st1 { v9.16b}, [x2], #16 //AES final-5 block - store result + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + + ldr q9, [x0], #16 //AES final-4 block - load plaintext + pmull v26.1q, 
v8.1d, v22.1d //GHASH final-5 block - low + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result +.L256_enc_blocks_more_than_4: //blocks left > 4 + + st1 { v9.16b}, [x2], #16 //AES final-4 block - store result + + rev64 v8.16b, v9.16b //GHASH final-4 block + + ldr q9, [x0], #16 //AES final-3 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + +.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high +.L256_enc_blocks_more_than_3: //blocks left > 3 + + st1 { v9.16b}, [x2], #16 //AES final-3 block - store result + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + ldr q24, [x3, #96] //load h4k | h3k + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + ldr q9, [x0], #16 //AES final-2 block - load plaintext + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + +.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low +.L256_enc_blocks_more_than_2: //blocks left > 2 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + + st1 { v9.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v8.16b, v9.16b //GHASH final-2 block + ldr q9, [x0], #16 //AES final-1 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high +.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low +.L256_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v9.16b}, [x2], #16 //AES final-1 block - store result + + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-1 block + ldr q9, [x0], #16 //AES 
final block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + +.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ldr q21, [x3, #48] //load h2k | h1k + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid +.L256_enc_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x14, x6, xzr, lt + csel x13, x7, x6, lt + + mov v0.d[0], x13 //ctr0b is mask for last block + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + mov v0.d[1], x14 + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + + rev32 v30.16b, v30.16b + bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing + str q30, [x16] //store the updated counter + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + st1 { v9.16b}, [x2] //store all 16B + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + mov x0, x9 //return sizes + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L256_enc_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel +.globl unroll8_eor3_aes_gcm_dec_256_kernel +.type unroll8_eor3_aes_gcm_dec_256_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_dec_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L256_dec_ret + stp d8, d9, [sp, #-80]! 
+ lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + ld1 { v0.16b}, [x16] //CTR block 0 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + mov x5, x9 + + sub x5, x5, #1 //byte_len - 1 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese 
v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + ldp q27, q28, [x8, #160] //load rk10, rk11 + add x4, x0, x1, lsr #3 //end_input_ptr + add x5, x5, x0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, 
v0.16b //AES block 0 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + ldp q26, q27, [x8, #192] //load rk12, rk13 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 11 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 11 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 11 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 11 + ldr q28, [x8, #224] //load rk14 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 12 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 12 + + cmp x0, x5 //check if we have <= 8 blocks + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 12 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 12 + + aese v5.16b, v27.16b //AES block 5 - round 13 + aese v1.16b, v27.16b //AES block 1 - round 13 + aese v2.16b, v27.16b //AES block 2 - round 13 + + aese v0.16b, v27.16b //AES block 0 - round 13 + aese v4.16b, v27.16b //AES block 4 - round 13 + aese v6.16b, v27.16b //AES block 6 - round 13 + + aese v3.16b, v27.16b //AES block 3 - round 13 + aese v7.16b, v27.16b //AES block 7 - round 13 + b.ge .L256_dec_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext + cmp x0, x5 //check if we have <= 8 blocks + +.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result +.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result + stp q0, q1, [x2], #32 //AES block 0, 1 - store result + + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 +.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result + +.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result + +.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 + +.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result + stp q2, q3, [x2], #32 //AES block 2, 3 - store result + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR 
block 10 + +.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + stp q4, q5, [x2], #32 //AES block 4, 5 - store result + +.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result + stp q6, q7, [x2], #32 //AES block 6, 7 - store result + + rev32 v4.16b, v30.16b //CTR block 12 + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L256_dec_prepretail //do prepretail + +.L256_dec_main_loop: //main loop start + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + rev64 v8.16b, v8.16b //GHASH block 8k + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v12.16b, v12.16b //GHASH block 8k+4 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 
8k+11 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH 
block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + ldp q27, q28, [x8, #160] //load rk10, rk11 + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + 
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + rev32 v20.16b, v30.16b //CTR block 8k+16 + ldr d16, [x10] //MODULO - load modulo constant + + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + ldp q26, q27, [x8, #192] //load rk12, rk13 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + rev32 v22.16b, v30.16b //CTR block 8k+17 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 + + ldr q28, [x8, #224] //load rk14 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load 
ciphertext + aese v0.16b, v27.16b //AES block 8k+8 - round 13 + aese v5.16b, v27.16b //AES block 8k+13 - round 13 + + rev32 v25.16b, v30.16b //CTR block 8k+19 +.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result +.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result + + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + +.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result +.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + + stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result + mov v0.16b, v20.16b //CTR block 8k+16 +.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result + +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low +.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result + stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result + + mov v3.16b, v25.16b //CTR block 8k+19 + mov v2.16b, v23.16b //CTR block 8k+18 + aese v6.16b, v27.16b //AES block 8k+14 - round 13 + + mov v1.16b, v22.16b //CTR block 8k+17 + stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result +.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result + +.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + + cmp x0, x5 //.LOOP CONTROL + stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result + b.lt .L256_dec_main_loop + +.L256_dec_prepretail: //PREPRETAIL + ldp q26, q27, [x8, #0] //load rk0, rk1 + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev32 v6.16b, v30.16b //CTR block 8k+14 + rev64 v8.16b, v8.16b //GHASH block 8k + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 
8k+8 - round 1 + eor v8.16b, v8.16b, v19.16b //PRE 1 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + eor v18.16b, v18.16b, v24.16b //GHASH block 
8k+1 - mid + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + rev64 v13.16b, v13.16b //GHASH block 8k+5 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v4.16b, v28.16b + 
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + ldp q27, q28, [x8, #160] //load rk10, rk11 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + ldr d16, [x10] //MODULO - load modulo constant + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + ldp q26, q27, [x8, #192] //load rk12, rk13 + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + aese v3.16b, v26.16b + aesmc 
v3.16b, v3.16b //AES block 8k+11 - round 12 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + ldr q28, [x8, #224] //load rk14 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + + aese v6.16b, v27.16b //AES block 8k+14 - round 13 + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + + aese v5.16b, v27.16b //AES block 8k+13 - round 13 +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + aese v0.16b, v27.16b //AES block 8k+8 - round 13 +.L256_dec_tail: //TAIL + + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + cmp x5, #112 + + ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + mov v29.16b, v28.16b + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + +.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + b.gt .L256_dec_blocks_more_than_7 + + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v3.16b + movi v19.8b, #0 + + movi v17.8b, #0 + movi v18.8b, #0 + mov v3.16b, v2.16b + + cmp x5, #96 + mov v2.16b, v1.16b + b.gt .L256_dec_blocks_more_than_6 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + cmp x5, #80 + sub v30.4s, v30.4s, v31.4s + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + b.gt .L256_dec_blocks_more_than_5 + + cmp x5, #64 + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v1.16b + b.gt .L256_dec_blocks_more_than_4 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + cmp x5, #48 + + mov v6.16b, v5.16b + mov v5.16b, v1.16b + b.gt .L256_dec_blocks_more_than_3 + + ldr q24, [x3, #96] //load h4k | h3k + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + + cmp x5, #32 + mov v6.16b, v1.16b + b.gt .L256_dec_blocks_more_than_2 + + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v1.16b + cmp x5, #16 + b.gt .L256_dec_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L256_dec_blocks_less_than_1 +.L256_dec_blocks_more_than_7: //blocks left > 7 + rev64 v8.16b, v9.16b //GHASH final-7 block + ldr q9, [x0], #16 //AES final-6 block - load ciphertext + st1 { v12.16b}, [x2], #16 //AES final-7 block - store result + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-7 block - 
mid +.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid +.L256_dec_blocks_more_than_6: //blocks left > 6 + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q9, [x0], #16 //AES final-5 block - load ciphertext + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + st1 { v12.16b}, [x2], #16 //AES final-6 block - store result + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + +.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high +.L256_dec_blocks_more_than_5: //blocks left > 5 + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + ldr q9, [x0], #16 //AES final-4 block - load ciphertext + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + st1 { v12.16b}, [x2], #16 //AES final-5 block - store result + + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high +.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid + movi v16.8b, #0 //suppress further partial tag feed in +.L256_dec_blocks_more_than_4: //blocks left > 4 + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + ldr q9, [x0], #16 //AES final-3 block - load ciphertext + + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + st1 { v12.16b}, [x2], #16 //AES final-4 block - store result + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result +.L256_dec_blocks_more_than_3: //blocks left > 3 + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q9, [x0], #16 //AES final-2 block - load ciphertext + ldr q24, [x3, #96] //load h4k | h3k + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + st1 { v12.16b}, [x2], #16 //AES final-3 block - store 
result + +.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid +.L256_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v8.16b, v9.16b //GHASH final-2 block + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q9, [x0], #16 //AES final-1 block - load ciphertext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + st1 { v12.16b}, [x2], #16 //AES final-2 block - store result +.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high +.L256_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v8.16b, v9.16b //GHASH final-1 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + ldr q9, [x0], #16 //AES final block - load ciphertext + st1 { v12.16b}, [x2], #16 //AES final-1 block - store result + + ldr q21, [x3, #48] //load h2k | h1k + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + +.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid +.L256_dec_blocks_less_than_1: //blocks left <= 1 + + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + rev32 v30.16b, v30.16b + str q30, [x16] //store the updated counter + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x14, x6, xzr, lt + csel x13, x7, x6, lt + + mov v0.d[0], x13 //ctr0b is mask for last block + mov v0.d[1], x14 + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + bif v12.16b, v26.16b, v0.16b 
//insert existing bytes in top end of result before storing + + rev64 v8.16b, v9.16b //GHASH final block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + st1 { v12.16b}, [x2] //store all 16B + + eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up + + eor v21.16b, v17.16b, v21.16b //MODULO - fold into mid + eor v18.16b, v18.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + eor v19.16b, v19.16b, v17.16b //MODULO - fold into low + + eor v19.16b, v19.16b, v18.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L256_dec_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel +.byte 65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0 +.align 2 +.align 2 +#endif diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S index 55856548fa6f..d8082ccbe0a7 100644 --- a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S +++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S @@ -6390,6 +6390,7 @@ aes_gcm_dec_256_kernel: mov w0, #0x0 ret .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel +.section .rodata .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/aesv8-armx.S b/sys/crypto/openssl/aarch64/aesv8-armx.S index 015c2eea6dbb..d46d1f0a208c 100644 --- a/sys/crypto/openssl/aarch64/aesv8-armx.S +++ b/sys/crypto/openssl/aarch64/aesv8-armx.S @@ -4,12 +4,13 @@ #if __ARM_MAX_ARCH__>=7 .arch armv8-a+crypto .text +.section .rodata .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b - +.previous .globl aes_v8_set_encrypt_key .type aes_v8_set_encrypt_key,%function .align 5 @@ -32,7 +33,8 @@ aes_v8_set_encrypt_key: tst w1,#0x3f b.ne .Lenc_key_abort - adr x3,.Lrcon + adrp x3,.Lrcon + add x3,x3,#:lo12:.Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b @@ -1509,6 +1511,729 @@ aes_v8_cbc_encrypt: ldr x29,[sp],#16 ret .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl aes_v8_ctr32_encrypt_blocks_unroll12_eor3 +.type aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function +.align 5 +aes_v8_ctr32_encrypt_blocks_unroll12_eor3: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A 
PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-80]! + stp d8,d9,[sp, #16] + stp d10,d11,[sp, #32] + stp d12,d13,[sp, #48] + stp d14,d15,[sp, #64] + add x29,sp,#0 + + ldr w5,[x3,#240] + + ldr w8, [x4, #12] +#ifdef __AARCH64EB__ + ld1 {v24.16b},[x4] +#else + ld1 {v24.4s},[x4] +#endif + ld1 {v2.4s,v3.4s},[x3] // load key schedule... + sub w5,w5,#4 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last round key + sub w5,w5,#2 + add x7, x7, #64 + ld1 {v1.4s},[x7] + add x7,x3,#32 + mov w6,w5 +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + + orr v25.16b,v24.16b,v24.16b + add w10, w8, #1 + orr v26.16b,v24.16b,v24.16b + add w8, w8, #2 + orr v0.16b,v24.16b,v24.16b + rev w10, w10 + mov v25.s[3],w10 + b.ls .Lctr32_tail_unroll + cmp x2,#6 + rev w12, w8 + sub x2,x2,#3 // bias + mov v26.s[3],w12 + b.lo .Loop3x_ctr32_unroll + cmp x2,#9 + orr v27.16b,v24.16b,v24.16b + add w11, w8, #1 + orr v28.16b,v24.16b,v24.16b + add w13, w8, #2 + rev w11, w11 + orr v29.16b,v24.16b,v24.16b + add w8, w8, #3 + rev w13, w13 + mov v27.s[3],w11 + rev w14, w8 + mov v28.s[3],w13 + mov v29.s[3],w14 + sub x2,x2,#3 + b.lo .Loop6x_ctr32_unroll + + // push regs to stack when 12 data chunks are interleaved + stp x19,x20,[sp,#-16]! + stp x21,x22,[sp,#-16]! + stp x23,x24,[sp,#-16]! + stp d8,d9,[sp,#-32]! + stp d10,d11,[sp,#-32]! + + add w15,w8,#1 + add w19,w8,#2 + add w20,w8,#3 + add w21,w8,#4 + add w22,w8,#5 + add w8,w8,#6 + orr v30.16b,v24.16b,v24.16b + rev w15,w15 + orr v31.16b,v24.16b,v24.16b + rev w19,w19 + orr v8.16b,v24.16b,v24.16b + rev w20,w20 + orr v9.16b,v24.16b,v24.16b + rev w21,w21 + orr v10.16b,v24.16b,v24.16b + rev w22,w22 + orr v11.16b,v24.16b,v24.16b + rev w23,w8 + + sub x2,x2,#6 // bias + mov v30.s[3],w15 + mov v31.s[3],w19 + mov v8.s[3],w20 + mov v9.s[3],w21 + mov v10.s[3],w22 + mov v11.s[3],w23 + b .Loop12x_ctr32_unroll + +.align 4 +.Loop12x_ctr32_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + aese v30.16b,v3.16b + aesmc v30.16b,v30.16b + aese v31.16b,v3.16b + aesmc v31.16b,v31.16b + aese v8.16b,v3.16b + aesmc v8.16b,v8.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v3.4s},[x7],#16 + b.gt .Loop12x_ctr32_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese 
v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + aese v30.16b,v3.16b + aesmc v30.16b,v30.16b + aese v31.16b,v3.16b + aesmc v31.16b,v31.16b + aese v8.16b,v3.16b + aesmc v8.16b,v8.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v3.4s},[x7],#16 + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + add w9,w8,#1 + add w10,w8,#2 + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + add w12,w8,#3 + add w11,w8,#4 + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + add w13,w8,#5 + add w14,w8,#6 + rev w9,w9 + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + add w15,w8,#7 + add w19,w8,#8 + rev w10,w10 + rev w12,w12 + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + add w20,w8,#9 + add w21,w8,#10 + rev w11,w11 + rev w13,w13 + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + add w22,w8,#11 + add w23,w8,#12 + rev w14,w14 + rev w15,w15 + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + rev w19,w19 + rev w20,w20 + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + rev w21,w21 + rev w22,w22 + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + rev w23,w23 + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + aese v30.16b,v3.16b + aesmc v30.16b,v30.16b + aese v31.16b,v3.16b + aesmc v31.16b,v31.16b + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + aese v8.16b,v3.16b + aesmc v8.16b,v8.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + ld1 {v3.4s},[x7],#16 + + mov x7, x3 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0] + + aese v24.16b,v3.16b +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b + orr v24.16b,v0.16b,v0.16b + aese v25.16b,v3.16b +.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b + orr v25.16b,v0.16b,v0.16b + aese v26.16b,v3.16b +.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b + orr v26.16b,v0.16b,v0.16b + aese v27.16b,v3.16b +.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b + orr v27.16b,v0.16b,v0.16b + aese v28.16b,v3.16b +.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b + orr v28.16b,v0.16b,v0.16b + aese v29.16b,v3.16b +.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b + orr 
v29.16b,v0.16b,v0.16b + aese v30.16b,v3.16b +.inst 0xce017a52 //eor3 v18.16b,v18.16b,v1.16b,v30.16b + orr v30.16b,v0.16b,v0.16b + aese v31.16b,v3.16b +.inst 0xce017e73 //eor3 v19.16b,v19.16b,v1.16b,v31.16b + orr v31.16b,v0.16b,v0.16b + aese v8.16b,v3.16b +.inst 0xce012294 //eor3 v20.16b,v20.16b,v1.16b,v8.16b + orr v8.16b,v0.16b,v0.16b + aese v9.16b,v3.16b +.inst 0xce0126b5 //eor3 v21.16b,v21.16b,v1.16b,v9.16b + orr v9.16b,v0.16b,v0.16b + aese v10.16b,v3.16b +.inst 0xce012ad6 //eor3 v22.16b,v22.16b,v1.16b,v10.16b + orr v10.16b,v0.16b,v0.16b + aese v11.16b,v3.16b +.inst 0xce012ef7 //eor3 v23.16b,v23.16b,v1.16b,v11.16b + orr v11.16b,v0.16b,v0.16b + ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1] + + mov v24.s[3],w9 + mov v25.s[3],w10 + mov v26.s[3],w12 + mov v27.s[3],w11 + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + mov v28.s[3],w13 + mov v29.s[3],w14 + mov v30.s[3],w15 + mov v31.s[3],w19 + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 + mov v8.s[3],w20 + mov v9.s[3],w21 + mov v10.s[3],w22 + mov v11.s[3],w23 + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + mov w6,w5 + + add w8,w8,#12 + subs x2,x2,#12 + b.hs .Loop12x_ctr32_unroll + + // pop regs from stack when 12 data chunks are interleaved + ldp d10,d11,[sp],#32 + ldp d8,d9,[sp],#32 + ldp x23,x24,[sp],#16 + ldp x21,x22,[sp],#16 + ldp x19,x20,[sp],#16 + + add x2,x2,#12 + cbz x2,.Lctr32_done_unroll + sub w8,w8,#12 + + cmp x2,#2 + b.ls .Lctr32_tail_unroll + + cmp x2,#6 + sub x2,x2,#3 // bias + add w8,w8,#3 + b.lo .Loop3x_ctr32_unroll + + sub x2,x2,#3 + add w8,w8,#3 + b.lo .Loop6x_ctr32_unroll + +.align 4 +.Loop6x_ctr32_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + ld1 {v3.4s},[x7],#16 + b.gt .Loop6x_ctr32_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + ld1 {v3.4s},[x7],#16 + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + add w9,w8,#1 + add w10,w8,#2 + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + add w12,w8,#3 + add w11,w8,#4 + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + add w13,w8,#5 + add w14,w8,#6 + rev w9,w9 + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + rev w10,w10 + rev w12,w12 + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + rev w11,w11 + rev w13,w13 + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + rev w14,w14 + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc 
v27.16b,v27.16b + ld1 {v16.16b,v17.16b},[x0],#32 + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + ld1 {v3.4s},[x7],#16 + + mov x7, x3 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0] + + aese v24.16b,v3.16b +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b + aese v25.16b,v3.16b +.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b + aese v26.16b,v3.16b +.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b + aese v27.16b,v3.16b +.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b + aese v28.16b,v3.16b +.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b + aese v29.16b,v3.16b +.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b + ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1] + + orr v24.16b,v0.16b,v0.16b + orr v25.16b,v0.16b,v0.16b + orr v26.16b,v0.16b,v0.16b + orr v27.16b,v0.16b,v0.16b + orr v28.16b,v0.16b,v0.16b + orr v29.16b,v0.16b,v0.16b + + mov v24.s[3],w9 + mov v25.s[3],w10 + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + mov v26.s[3],w12 + mov v27.s[3],w11 + st1 {v16.16b,v17.16b},[x1],#32 + mov v28.s[3],w13 + mov v29.s[3],w14 + + cbz x2,.Lctr32_done_unroll + mov w6,w5 + + cmp x2,#2 + b.ls .Lctr32_tail_unroll + + sub x2,x2,#3 // bias + add w8,w8,#3 + b .Loop3x_ctr32_unroll + +.align 4 +.Loop3x_ctr32_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + ld1 {v3.4s},[x7],#16 + b.gt .Loop3x_ctr32_unroll + + aese v24.16b,v2.16b + aesmc v9.16b,v24.16b + aese v25.16b,v2.16b + aesmc v10.16b,v25.16b + ld1 {v4.16b,v5.16b,v6.16b},[x0],#48 + orr v24.16b,v0.16b,v0.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + ld1 {v2.4s},[x7],#16 + orr v25.16b,v0.16b,v0.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v26.16b,v3.16b + aesmc v11.16b,v26.16b + ld1 {v3.4s},[x7],#16 + orr v26.16b,v0.16b,v0.16b + add w9,w8,#1 + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + add w10,w8,#2 + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + add w8,w8,#3 + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + + rev w9,w9 + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v3.4s},[x7],#16 + mov v24.s[3], w9 + mov x7,x3 + rev w10,w10 + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + mov v25.s[3], w10 + rev w12,w8 + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + mov v26.s[3], w12 + + aese v9.16b,v3.16b + aese v10.16b,v3.16b + aese v11.16b,v3.16b + +.inst 0xce012484 //eor3 v4.16b,v4.16b,v1.16b,v9.16b + ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0] +.inst 0xce0128a5 //eor3 v5.16b,v5.16b,v1.16b,v10.16b + mov w6,w5 +.inst 0xce012cc6 //eor3 v6.16b,v6.16b,v1.16b,v11.16b + ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b,v5.16b,v6.16b},[x1],#48 + + cbz x2,.Lctr32_done_unroll + +.Lctr32_tail_unroll: + cmp x2,#1 + b.eq .Lctr32_tail_1_unroll + +.Lctr32_tail_2_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc 
v25.16b,v25.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + ld1 {v3.4s},[x7],#16 + b.gt .Lctr32_tail_2_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + ld1 {v3.4s},[x7],#16 + ld1 {v4.16b,v5.16b},[x0],#32 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + ld1 {v3.4s},[x7],#16 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v24.16b,v3.16b + aese v25.16b,v3.16b + +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b +.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b + st1 {v4.16b,v5.16b},[x1],#32 + b .Lctr32_done_unroll + +.Lctr32_tail_1_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + ld1 {v3.4s},[x7],#16 + b.gt .Lctr32_tail_1_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + ld1 {v3.4s},[x7],#16 + ld1 {v4.16b},[x0] + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + ld1 {v3.4s},[x7],#16 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v24.16b,v3.16b + +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b + st1 {v4.16b},[x1],#16 + +.Lctr32_done_unroll: + ldp d8,d9,[sp, #16] + ldp d10,d11,[sp, #32] + ldp d12,d13,[sp, #48] + ldp d14,d15,[sp, #64] + ldr x29,[sp],#80 + ret +.size aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3 .globl aes_v8_ctr32_encrypt_blocks .type aes_v8_ctr32_encrypt_blocks,%function .align 5 @@ -3116,7 +3841,7 @@ aes_v8_xts_decrypt: cbnz x2,.Lxts_dec_1st_done ld1 {v0.16b},[x0],#16 - // Decrypt the last secod block to get the last plain text block + // Decrypt the last second block to get the last plain text block .Lxts_dec_1st_done: eor v26.16b,v0.16b,v8.16b ldr w6,[x3,#240] diff --git a/sys/crypto/openssl/aarch64/arm64cpuid.S b/sys/crypto/openssl/aarch64/arm64cpuid.S index 52c6ee5b65d3..81530bda1c67 100644 --- a/sys/crypto/openssl/aarch64/arm64cpuid.S +++ b/sys/crypto/openssl/aarch64/arm64cpuid.S @@ -57,14 +57,46 @@ _armv8_pmull_probe: ret .size _armv8_pmull_probe,.-_armv8_pmull_probe +.globl _armv8_sm4_probe +.type _armv8_sm4_probe,%function +_armv8_sm4_probe: + AARCH64_VALID_CALL_TARGET +.inst 0xcec08400 // sm4e v0.4s, v0.4s + ret +.size _armv8_sm4_probe,.-_armv8_sm4_probe + .globl _armv8_sha512_probe .type _armv8_sha512_probe,%function _armv8_sha512_probe: AARCH64_VALID_CALL_TARGET -.long 0xcec08000 // sha512su0 v0.2d,v0.2d +.inst 0xcec08000 // sha512su0 v0.2d,v0.2d ret .size _armv8_sha512_probe,.-_armv8_sha512_probe +.globl _armv8_eor3_probe +.type _armv8_eor3_probe,%function +_armv8_eor3_probe: + AARCH64_VALID_CALL_TARGET +.inst 0xce010800 // eor3 v0.16b, v0.16b, v1.16b, v2.16b + ret +.size _armv8_eor3_probe,.-_armv8_eor3_probe + +.globl _armv8_sve_probe +.type _armv8_sve_probe,%function +_armv8_sve_probe: + AARCH64_VALID_CALL_TARGET +.inst 0x04a03000 // eor z0.d,z0.d,z0.d + ret +.size _armv8_sve_probe,.-_armv8_sve_probe + +.globl _armv8_sve2_probe +.type _armv8_sve2_probe,%function +_armv8_sve2_probe: + AARCH64_VALID_CALL_TARGET +.inst 
0x04e03400 // xar z0.d,z0.d,z0.d + ret +.size _armv8_sve2_probe,.-_armv8_sve2_probe + .globl _armv8_cpuid_probe .type _armv8_cpuid_probe,%function _armv8_cpuid_probe: @@ -73,6 +105,14 @@ _armv8_cpuid_probe: ret .size _armv8_cpuid_probe,.-_armv8_cpuid_probe +.globl _armv8_sm3_probe +.type _armv8_sm3_probe,%function +_armv8_sm3_probe: + AARCH64_VALID_CALL_TARGET +.inst 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s + ret +.size _armv8_sm3_probe,.-_armv8_sm3_probe + .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 @@ -138,3 +178,98 @@ CRYPTO_memcmp: lsr w0,w0,#31 ret .size CRYPTO_memcmp,.-CRYPTO_memcmp + +.globl _armv8_rng_probe +.type _armv8_rng_probe,%function +_armv8_rng_probe: + AARCH64_VALID_CALL_TARGET + mrs x0, s3_3_c2_c4_0 // rndr + mrs x0, s3_3_c2_c4_1 // rndrrs + ret +.size _armv8_rng_probe,.-_armv8_rng_probe +// Fill buffer with Randomly Generated Bytes +// inputs: char * in x0 - Pointer to buffer +// size_t in x1 - Number of bytes to write to buffer +// outputs: size_t in x0 - Number of bytes successfully written to buffer +.globl OPENSSL_rndr_asm +.type OPENSSL_rndr_asm,%function +.align 4 +OPENSSL_rndr_asm: + AARCH64_VALID_CALL_TARGET + mov x2,xzr + mov x3,xzr + +.align 4 +.Loop_rndr: + cmp x1,#0 + b.eq .rndr_done + mov x3,xzr + mrs x3,s3_3_c2_c4_0 + b.eq .rndr_done + + cmp x1,#8 + b.lt .Loop_single_byte_rndr + + str x3,[x0] + add x0,x0,#8 + add x2,x2,#8 + subs x1,x1,#8 + b.ge .Loop_rndr + +.align 4 +.Loop_single_byte_rndr: + strb w3,[x0] + lsr x3,x3,#8 + add x2,x2,#1 + add x0,x0,#1 + subs x1,x1,#1 + b.gt .Loop_single_byte_rndr + +.align 4 +.rndr_done: + mov x0,x2 + ret +.size OPENSSL_rndr_asm,.-OPENSSL_rndr_asm +// Fill buffer with Randomly Generated Bytes +// inputs: char * in x0 - Pointer to buffer +// size_t in x1 - Number of bytes to write to buffer +// outputs: size_t in x0 - Number of bytes successfully written to buffer +.globl OPENSSL_rndrrs_asm +.type OPENSSL_rndrrs_asm,%function +.align 4 +OPENSSL_rndrrs_asm: + AARCH64_VALID_CALL_TARGET + mov x2,xzr + mov x3,xzr + +.align 4 +.Loop_rndrrs: + cmp x1,#0 + b.eq .rndrrs_done + mov x3,xzr + mrs x3,s3_3_c2_c4_1 + b.eq .rndrrs_done + + cmp x1,#8 + b.lt .Loop_single_byte_rndrrs + + str x3,[x0] + add x0,x0,#8 + add x2,x2,#8 + subs x1,x1,#8 + b.ge .Loop_rndrrs + +.align 4 +.Loop_single_byte_rndrrs: + strb w3,[x0] + lsr x3,x3,#8 + add x2,x2,#1 + add x0,x0,#1 + subs x1,x1,#1 + b.gt .Loop_single_byte_rndrrs + +.align 4 +.rndrrs_done: + mov x0,x2 + ret +.size OPENSSL_rndrrs_asm,.-OPENSSL_rndrrs_asm diff --git a/sys/crypto/openssl/aarch64/armv8-mont.S b/sys/crypto/openssl/aarch64/armv8-mont.S index b429f39ee326..a12dcf3dcfc0 100644 --- a/sys/crypto/openssl/aarch64/armv8-mont.S +++ b/sys/crypto/openssl/aarch64/armv8-mont.S @@ -2131,6 +2131,7 @@ __bn_mul4x_mont: AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_mul4x_mont,.-__bn_mul4x_mont +.section .rodata .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 4 diff --git a/sys/crypto/openssl/aarch64/bsaes-armv8.S b/sys/crypto/openssl/aarch64/bsaes-armv8.S new file mode 100644 index 000000000000..cd43f2db7e21 --- /dev/null +++ b/sys/crypto/openssl/aarch64/bsaes-armv8.S @@ -0,0 +1,2356 @@ +/* Do not modify. This file is auto-generated from bsaes-armv8.pl. */ +// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// ==================================================================== +// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL +// project. Rights for redistribution and usage in source and binary +// forms are granted according to the OpenSSL license. +// ==================================================================== +// +// This implementation is a translation of bsaes-armv7 for AArch64. +// No attempt has been made to carry across the build switches for +// kernel targets, since the Linux kernel crypto support has moved on +// from when it was based on OpenSSL. + +// A lot of hand-scheduling has been performed. Consequently, this code +// doesn't factor out neatly into macros in the same way that the +// AArch32 version did, and there is little to be gained by wrapping it +// up in Perl, and it is presented as pure assembly. + + +#include "crypto/arm_arch.h" + +.text + + + + + +.type _bsaes_decrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_decrypt8: + ldr q8, [x9], #16 + adrp x11, .LM0ISR + add x11, x11, #:lo12:.LM0ISR + movi v9.16b, #0x55 + ldr q10, [x11], #16 + movi v16.16b, #0x33 + movi v17.16b, #0x0f + sub x10, x10, #1 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v10.16b + tbl v1.16b, {v1.16b}, v10.16b + tbl v2.16b, {v2.16b}, v10.16b + tbl v4.16b, {v4.16b}, v10.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v10.16b + tbl v5.16b, {v5.16b}, v10.16b + tbl v6.16b, {v6.16b}, v10.16b + ushr v8.2d, v0.2d, #1 + tbl v7.16b, {v7.16b}, v10.16b + ushr v10.2d, v4.2d, #1 + ushr v18.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + ushr v19.2d, v6.2d, #1 + eor v10.16b, v10.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v9.16b + eor v19.16b, v19.16b, v7.16b + and v10.16b, v10.16b, v9.16b + and v18.16b, v18.16b, v9.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v9.16b, v19.16b, v9.16b + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #1 + eor v3.16b, v3.16b, v18.16b + shl v18.2d, v18.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v9.2d, #1 + eor v7.16b, v7.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v18.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v18.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + 
ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v10.16b, v10.16b, v17.16b + and v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Ldec_sbox +.align 4 +.Ldec_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl v3.16b, {v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Ldec_sbox: + eor v1.16b, v1.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + subs x10, x10, #1 + eor v4.16b, v4.16b, v7.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v1.16b, v6.16b + eor v6.16b, v6.16b, v4.16b + eor v2.16b, v2.16b, v5.16b + eor v0.16b, v0.16b, v1.16b + eor v7.16b, v7.16b, v6.16b + eor v8.16b, v6.16b, v2.16b + and v9.16b, v4.16b, v6.16b + eor v10.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v16.16b, v7.16b, v4.16b + eor v17.16b, v4.16b, v0.16b + and v18.16b, v0.16b, v2.16b + eor v19.16b, v7.16b, v4.16b + eor v1.16b, v1.16b, v3.16b + eor v20.16b, v3.16b, v0.16b + eor v21.16b, v5.16b, v2.16b + eor v22.16b, v3.16b, v7.16b + and v8.16b, v17.16b, v8.16b + orr v17.16b, v3.16b, v5.16b + eor v23.16b, v1.16b, v6.16b + eor v24.16b, v20.16b, v16.16b + eor v25.16b, v1.16b, v5.16b + orr v26.16b, v20.16b, v21.16b + and v20.16b, v20.16b, v21.16b + and v27.16b, v7.16b, v1.16b + eor v21.16b, v21.16b, v23.16b + orr v28.16b, v16.16b, v23.16b + orr v29.16b, v22.16b, v25.16b + eor v26.16b, v26.16b, v8.16b + and v16.16b, v16.16b, v23.16b + and v22.16b, v22.16b, v25.16b + and v21.16b, v24.16b, v21.16b + eor v8.16b, v28.16b, v8.16b + eor v23.16b, v5.16b, v2.16b + eor v24.16b, v1.16b, v6.16b + eor v16.16b, v16.16b, v22.16b + eor v22.16b, v3.16b, v0.16b + eor v25.16b, v29.16b, v21.16b + eor v21.16b, v26.16b, v21.16b + eor v8.16b, v8.16b, v20.16b + eor v26.16b, v23.16b, v24.16b + eor v16.16b, v16.16b, v20.16b + eor v28.16b, v22.16b, v19.16b + eor v20.16b, v25.16b, v20.16b + eor v9.16b, v21.16b, v9.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v5.16b, v1.16b + eor v21.16b, v16.16b, v17.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v20.16b, v27.16b + eor v20.16b, v3.16b, v7.16b + eor v25.16b, v9.16b, v8.16b + eor v27.16b, v0.16b, v4.16b + and v29.16b, v9.16b, v17.16b + eor v30.16b, v8.16b, v29.16b + eor v31.16b, v21.16b, v29.16b + eor v29.16b, v21.16b, v29.16b + bsl v30.16b, v17.16b, v21.16b + bsl v31.16b, v9.16b, v8.16b + bsl v16.16b, v30.16b, v29.16b + bsl v21.16b, v29.16b, v30.16b + eor v8.16b, v31.16b, v30.16b + and v1.16b, v1.16b, v31.16b + and v9.16b, v16.16b, v31.16b + and v6.16b, v6.16b, v30.16b + eor v16.16b, v17.16b, v21.16b + and v4.16b, v4.16b, v30.16b + eor v17.16b, 
v8.16b, v30.16b + and v21.16b, v24.16b, v8.16b + eor v9.16b, v9.16b, v25.16b + and v19.16b, v19.16b, v8.16b + eor v24.16b, v30.16b, v16.16b + eor v25.16b, v30.16b, v16.16b + and v7.16b, v7.16b, v17.16b + and v10.16b, v10.16b, v16.16b + eor v29.16b, v9.16b, v16.16b + eor v30.16b, v31.16b, v9.16b + and v0.16b, v24.16b, v0.16b + and v9.16b, v18.16b, v9.16b + and v2.16b, v25.16b, v2.16b + eor v10.16b, v10.16b, v6.16b + eor v18.16b, v29.16b, v16.16b + and v5.16b, v30.16b, v5.16b + eor v24.16b, v8.16b, v29.16b + and v25.16b, v26.16b, v29.16b + and v26.16b, v28.16b, v29.16b + eor v8.16b, v8.16b, v29.16b + eor v17.16b, v17.16b, v18.16b + eor v5.16b, v1.16b, v5.16b + and v23.16b, v24.16b, v23.16b + eor v21.16b, v21.16b, v25.16b + eor v19.16b, v19.16b, v26.16b + eor v0.16b, v4.16b, v0.16b + and v3.16b, v17.16b, v3.16b + eor v1.16b, v9.16b, v1.16b + eor v9.16b, v25.16b, v23.16b + eor v5.16b, v5.16b, v21.16b + eor v2.16b, v6.16b, v2.16b + and v6.16b, v8.16b, v22.16b + eor v3.16b, v7.16b, v3.16b + and v8.16b, v20.16b, v18.16b + eor v10.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v19.16b + eor v9.16b, v1.16b, v9.16b + eor v1.16b, v2.16b, v21.16b + eor v3.16b, v3.16b, v19.16b + and v16.16b, v27.16b, v16.16b + eor v17.16b, v26.16b, v6.16b + eor v6.16b, v8.16b, v7.16b + eor v7.16b, v1.16b, v9.16b + eor v1.16b, v5.16b, v3.16b + eor v2.16b, v10.16b, v3.16b + eor v4.16b, v16.16b, v4.16b + eor v8.16b, v6.16b, v17.16b + eor v5.16b, v9.16b, v3.16b + eor v9.16b, v0.16b, v1.16b + eor v6.16b, v7.16b, v1.16b + eor v0.16b, v4.16b, v17.16b + eor v4.16b, v8.16b, v7.16b + eor v7.16b, v9.16b, v2.16b + eor v8.16b, v3.16b, v0.16b + eor v7.16b, v7.16b, v5.16b + eor v3.16b, v4.16b, v7.16b + eor v4.16b, v7.16b, v0.16b + eor v7.16b, v8.16b, v3.16b + bcc .Ldec_done + ext v8.16b, v0.16b, v0.16b, #8 + ext v9.16b, v1.16b, v1.16b, #8 + ldr q28, [x11] // load from .LISR in common case (x10 > 0) + ext v10.16b, v6.16b, v6.16b, #8 + ext v16.16b, v3.16b, v3.16b, #8 + ext v17.16b, v5.16b, v5.16b, #8 + ext v18.16b, v4.16b, v4.16b, #8 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v6.16b + eor v16.16b, v16.16b, v3.16b + eor v17.16b, v17.16b, v5.16b + ext v19.16b, v2.16b, v2.16b, #8 + ext v20.16b, v7.16b, v7.16b, #8 + eor v18.16b, v18.16b, v4.16b + eor v6.16b, v6.16b, v8.16b + eor v8.16b, v2.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v19.16b, v2.16b + eor v9.16b, v20.16b, v7.16b + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v16.16b + eor v6.16b, v6.16b, v17.16b + eor v8.16b, v8.16b, v16.16b + eor v7.16b, v7.16b, v18.16b + eor v4.16b, v4.16b, v16.16b + eor v2.16b, v3.16b, v2.16b + eor v1.16b, v1.16b, v17.16b + eor v3.16b, v5.16b, v9.16b + eor v5.16b, v8.16b, v17.16b + eor v7.16b, v7.16b, v17.16b + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v6.16b, v6.16b, #12 + ext v10.16b, v4.16b, v4.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v5.16b, v5.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v3.16b, v3.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v3.16b, v3.16b, v20.16b + eor v17.16b, v17.16b, v4.16b + eor v10.16b, v10.16b, v6.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v3.16b + eor v16.16b, v16.16b, v3.16b + eor v18.16b, v18.16b, v5.16b + eor v19.16b, 
v19.16b, v7.16b + ext v21.16b, v5.16b, v5.16b, #8 + ext v5.16b, v7.16b, v7.16b, #8 + eor v7.16b, v20.16b, v2.16b + ext v4.16b, v4.16b, v4.16b, #8 + ext v20.16b, v3.16b, v3.16b, #8 + eor v17.16b, v17.16b, v3.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v3.16b, v10.16b, v3.16b + ext v10.16b, v6.16b, v6.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v18.16b + eor v3.16b, v3.16b, v4.16b + eor v7.16b, v20.16b, v7.16b + eor v6.16b, v2.16b, v19.16b + eor v4.16b, v21.16b, v17.16b + eor v2.16b, v10.16b, v9.16b + bne .Ldec_loop + ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0) + b .Ldec_loop +.align 4 +.Ldec_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v2.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v6.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v3.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v4.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v4.16b, v4.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v6.16b, v6.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v3.16b, v3.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v2.2d, #2 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v6.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v4.16b, v4.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v2.16b, v2.16b, v17.16b + ushr v8.2d, v4.2d, #4 + ushr v9.2d, v6.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v3.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v2.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v4.16b, v4.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v2.16b, v2.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v4.16b, v4.16b, v10.16b + eor v6.16b, v6.16b, v10.16b + eor v3.16b, v3.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.section .rodata +.type _bsaes_consts,%object +.align 6 +_bsaes_consts: +// InvShiftRows constants +// Used in _bsaes_decrypt8, which assumes contiguity +// .LM0ISR used with round 0 key +// .LISR used with middle round keys +// .LISRM0 used with final round key +.LM0ISR: +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d + +// ShiftRows constants +// Used in _bsaes_encrypt8, 
which assumes contiguity +// .LM0SR used with round 0 key +// .LSR used with middle round keys +// .LSRM0 used with final round key +.LM0SR: +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d + +.LM0_bigendian: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LM0_littleendian: +.quad 0x0105090d0004080c, 0x03070b0f02060a0e + +// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into +// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR +.LREVM0SR: +.quad 0x090d01050c000408, 0x03070b0f060a0e02 + +.align 6 +.size _bsaes_consts,.-_bsaes_consts + +.previous + +.type _bsaes_encrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_encrypt8: + ldr q8, [x9], #16 + adrp x11, .LM0SR + add x11, x11, #:lo12:.LM0SR + ldr q9, [x11], #16 +_bsaes_encrypt8_alt: + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + sub x10, x10, #1 + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v9.16b + tbl v1.16b, {v1.16b}, v9.16b + tbl v2.16b, {v2.16b}, v9.16b + tbl v4.16b, {v4.16b}, v9.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v9.16b + tbl v5.16b, {v5.16b}, v9.16b + tbl v6.16b, {v6.16b}, v9.16b + ushr v8.2d, v0.2d, #1 + movi v10.16b, #0x55 + tbl v7.16b, {v7.16b}, v9.16b + ushr v9.2d, v4.2d, #1 + movi v16.16b, #0x33 + ushr v17.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + movi v18.16b, #0x0f + ushr v19.2d, v6.2d, #1 + eor v9.16b, v9.16b, v5.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v10.16b + eor v19.16b, v19.16b, v7.16b + and v9.16b, v9.16b, v10.16b + and v17.16b, v17.16b, v10.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v10.16b, v19.16b, v10.16b + eor v5.16b, v5.16b, v9.16b + shl v9.2d, v9.2d, #1 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v10.2d, #1 + eor v7.16b, v7.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v2.16b, v17.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v17.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v17.16b, v17.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v17.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v18.16b + and v9.16b, v9.16b, v18.16b + and v10.16b, v10.16b, v18.16b + and v16.16b, v16.16b, v18.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor 
v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Lenc_sbox +.align 4 +.Lenc_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl v3.16b, {v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Lenc_sbox: + eor v5.16b, v5.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + subs x10, x10, #1 + eor v2.16b, v2.16b, v1.16b + eor v5.16b, v5.16b, v0.16b + eor v8.16b, v3.16b, v7.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v5.16b + eor v8.16b, v8.16b, v4.16b + eor v3.16b, v6.16b, v3.16b + eor v4.16b, v4.16b, v5.16b + eor v6.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v8.16b, v1.16b + eor v8.16b, v7.16b, v4.16b + eor v9.16b, v3.16b, v0.16b + eor v10.16b, v7.16b, v6.16b + eor v16.16b, v5.16b, v3.16b + eor v17.16b, v6.16b, v2.16b + eor v18.16b, v5.16b, v1.16b + eor v19.16b, v2.16b, v4.16b + eor v20.16b, v1.16b, v0.16b + orr v21.16b, v8.16b, v9.16b + orr v22.16b, v10.16b, v16.16b + eor v23.16b, v8.16b, v17.16b + eor v24.16b, v9.16b, v18.16b + and v19.16b, v19.16b, v20.16b + orr v20.16b, v17.16b, v18.16b + and v8.16b, v8.16b, v9.16b + and v9.16b, v17.16b, v18.16b + and v17.16b, v23.16b, v24.16b + and v10.16b, v10.16b, v16.16b + eor v16.16b, v21.16b, v19.16b + eor v18.16b, v20.16b, v19.16b + and v19.16b, v2.16b, v1.16b + and v20.16b, v6.16b, v5.16b + eor v21.16b, v22.16b, v17.16b + eor v9.16b, v9.16b, v10.16b + eor v10.16b, v16.16b, v17.16b + eor v16.16b, v18.16b, v8.16b + and v17.16b, v4.16b, v0.16b + orr v18.16b, v7.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v8.16b, v9.16b, v8.16b + eor v9.16b, v10.16b, v19.16b + eor v10.16b, v3.16b, v0.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v5.16b, v1.16b + eor v19.16b, v21.16b, v20.16b + eor v20.16b, v8.16b, v18.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v7.16b, v4.16b + eor v21.16b, v9.16b, v16.16b + eor v22.16b, v6.16b, v2.16b + and v23.16b, v9.16b, v19.16b + eor v24.16b, v10.16b, v17.16b + eor v25.16b, v0.16b, v1.16b + eor v26.16b, v7.16b, v6.16b + eor v27.16b, v18.16b, v22.16b + eor v28.16b, v3.16b, v5.16b + eor v29.16b, v16.16b, v23.16b + eor v30.16b, v20.16b, v23.16b + eor v23.16b, v20.16b, v23.16b + eor v31.16b, v4.16b, v2.16b + bsl v29.16b, v19.16b, v20.16b + bsl v30.16b, v9.16b, v16.16b + bsl v8.16b, v29.16b, v23.16b + bsl v20.16b, v23.16b, v29.16b + eor v9.16b, v30.16b, v29.16b + and v5.16b, v5.16b, v30.16b + and v8.16b, v8.16b, v30.16b + and v1.16b, v1.16b, v29.16b + eor v16.16b, v19.16b, v20.16b + and v2.16b, v2.16b, v29.16b + eor v19.16b, v9.16b, v29.16b + and v17.16b, v17.16b, v9.16b + eor v8.16b, v8.16b, v21.16b + and v20.16b, v22.16b, v9.16b + eor v21.16b, v29.16b, v16.16b + eor v22.16b, v29.16b, v16.16b + and v23.16b, v25.16b, v16.16b + and v6.16b, v6.16b, v19.16b + eor v25.16b, v8.16b, v16.16b + eor v29.16b, v30.16b, v8.16b + and v4.16b, v21.16b, v4.16b + and v8.16b, v28.16b, v8.16b + and v0.16b, v22.16b, v0.16b + eor v21.16b, v23.16b, v1.16b + eor v22.16b, v9.16b, v25.16b + eor v9.16b, 
v9.16b, v25.16b + eor v23.16b, v25.16b, v16.16b + and v3.16b, v29.16b, v3.16b + and v24.16b, v24.16b, v25.16b + and v25.16b, v27.16b, v25.16b + and v10.16b, v22.16b, v10.16b + and v9.16b, v9.16b, v18.16b + eor v18.16b, v19.16b, v23.16b + and v19.16b, v26.16b, v23.16b + eor v3.16b, v5.16b, v3.16b + eor v17.16b, v17.16b, v24.16b + eor v10.16b, v24.16b, v10.16b + and v16.16b, v31.16b, v16.16b + eor v20.16b, v20.16b, v25.16b + eor v9.16b, v25.16b, v9.16b + eor v4.16b, v2.16b, v4.16b + and v7.16b, v18.16b, v7.16b + eor v18.16b, v19.16b, v6.16b + eor v5.16b, v8.16b, v5.16b + eor v0.16b, v1.16b, v0.16b + eor v1.16b, v21.16b, v10.16b + eor v8.16b, v3.16b, v17.16b + eor v2.16b, v16.16b, v2.16b + eor v3.16b, v6.16b, v7.16b + eor v6.16b, v18.16b, v9.16b + eor v4.16b, v4.16b, v20.16b + eor v10.16b, v5.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v9.16b, v2.16b, v9.16b + eor v3.16b, v3.16b, v20.16b + eor v7.16b, v6.16b, v1.16b + eor v5.16b, v8.16b, v4.16b + eor v6.16b, v10.16b, v1.16b + eor v2.16b, v4.16b, v0.16b + eor v4.16b, v3.16b, v10.16b + eor v9.16b, v9.16b, v7.16b + eor v3.16b, v0.16b, v5.16b + eor v0.16b, v1.16b, v4.16b + eor v1.16b, v4.16b, v8.16b + eor v4.16b, v9.16b, v5.16b + eor v6.16b, v6.16b, v3.16b + bcc .Lenc_done + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v4.16b, v4.16b, #12 + ldr q28, [x11] + ext v10.16b, v6.16b, v6.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v3.16b, v3.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v5.16b, v5.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v5.16b, v5.16b, v20.16b + eor v17.16b, v17.16b, v6.16b + eor v10.16b, v10.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v5.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v7.16b + ext v3.16b, v3.16b, v3.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 + eor v20.16b, v20.16b, v2.16b + ext v6.16b, v6.16b, v6.16b, #8 + ext v21.16b, v5.16b, v5.16b, #8 + eor v17.16b, v17.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v10.16b, v10.16b, v5.16b + ext v22.16b, v4.16b, v4.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v7.16b, v18.16b + eor v4.16b, v3.16b, v17.16b + eor v3.16b, v6.16b, v10.16b + eor v7.16b, v21.16b, v20.16b + eor v6.16b, v2.16b, v19.16b + eor v2.16b, v22.16b, v9.16b + bne .Lenc_loop + ldr q28, [x11, #16]! 
// load from .LSRM0 on last round (x10 == 0) + b .Lenc_loop +.align 4 +.Lenc_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v3.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v4.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v2.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v6.16b, v6.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v3.16b, v3.16b, v9.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v3.2d, #2 + eor v8.16b, v8.16b, v6.16b + eor v9.16b, v9.16b, v4.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v2.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v6.16b, v6.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v4.16b, v4.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + ushr v8.2d, v6.2d, #4 + ushr v9.2d, v4.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v2.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v2.16b, v2.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v6.16b, v6.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v6.16b, v6.16b, v10.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_key_convert,%function +.align 4 +// On entry: +// x9 -> input key (big-endian) +// x10 = number of rounds +// x17 -> output key (native endianness) +// On exit: +// x9, x10 corrupted +// x11 -> .LM0_bigendian +// x17 -> last quadword of output key +// other general-purpose registers preserved +// v2-v6 preserved +// v7.16b[] = 0x63 +// v8-v14 preserved +// v15 = last round key (converted to native endianness) +// other SIMD registers corrupted +_bsaes_key_convert: +#ifdef __AARCH64EL__ + adrp x11, .LM0_littleendian + add x11, x11, #:lo12:.LM0_littleendian +#else + adrp x11, .LM0_bigendian + add x11, x11, #:lo12:.LM0_bigendian +#endif + ldr q0, [x9], #16 // load round 0 key + ldr q1, [x11] // .LM0 + ldr q15, [x9], #16 // load round 1 key + + movi v7.16b, #0x63 // compose .L63 + movi v16.16b, #0x01 // bit masks + movi v17.16b, #0x02 + movi v18.16b, #0x04 + movi v19.16b, #0x08 + movi v20.16b, #0x10 + movi v21.16b, #0x20 + movi v22.16b, #0x40 + movi v23.16b, #0x80 + +#ifdef __AARCH64EL__ + rev32 v0.16b, v0.16b +#endif + sub x10, x10, #1 + str q0, 
[x17], #16 // save round 0 key + +.align 4 +.Lkey_loop: + tbl v0.16b, {v15.16b}, v1.16b + ldr q15, [x9], #16 // load next round key + + eor v0.16b, v0.16b, v7.16b + cmtst v24.16b, v0.16b, v16.16b + cmtst v25.16b, v0.16b, v17.16b + cmtst v26.16b, v0.16b, v18.16b + cmtst v27.16b, v0.16b, v19.16b + cmtst v28.16b, v0.16b, v20.16b + cmtst v29.16b, v0.16b, v21.16b + cmtst v30.16b, v0.16b, v22.16b + cmtst v31.16b, v0.16b, v23.16b + sub x10, x10, #1 + st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key + st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64 + cbnz x10, .Lkey_loop + + // don't save last round key +#ifdef __AARCH64EL__ + rev32 v15.16b, v15.16b + adrp x11, .LM0_bigendian + add x11, x11, #:lo12:.LM0_bigendian +#endif + ret +.size _bsaes_key_convert,.-_bsaes_key_convert + +.globl ossl_bsaes_cbc_encrypt +.type ossl_bsaes_cbc_encrypt,%function +.align 4 +// On entry: +// x0 -> input ciphertext +// x1 -> output plaintext +// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16) +// x3 -> key +// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) +// w5 must be == 0 +// On exit: +// Output plaintext filled in +// Initialisation vector overwritten with last quadword of ciphertext +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + cmp x2, #128 + bhs .Lcbc_do_bsaes + b AES_cbc_encrypt +.Lcbc_do_bsaes: + + // it is up to the caller to make sure we are called with enc == 0 + + stp x29, x30, [sp, #-48]! + stp d8, d9, [sp, #16] + stp d10, d15, [sp, #32] + lsr x2, x2, #4 // len in 16 byte blocks + + ldr w15, [x3, #240] // get # of rounds + mov x14, sp + + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x3 // pass key + mov x10, x15 // pass # of rounds + mov sp, x17 // sp is sp + bl _bsaes_key_convert + ldr q6, [sp] + str q15, [x17] // save last round key + eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) + str q6, [sp] + + ldr q15, [x4] // load IV + b .Lcbc_dec_loop + +.align 4 +.Lcbc_dec_loop: + subs x2, x2, #0x8 + bmi .Lcbc_dec_loop_finish + + ldr q0, [x0], #16 // load input + mov x9, sp // pass the key + ldr q1, [x0], #16 + mov x10, x15 + ldr q2, [x0], #16 + ldr q3, [x0], #16 + ldr q4, [x0], #16 + ldr q5, [x0], #16 + ldr q6, [x0], #16 + ldr q7, [x0], #-7*16 + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + eor v1.16b, v1.16b, v16.16b + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + str q1, [x1], #16 + ldr q1, [x0], #16 + eor v1.16b, v4.16b, v1.16b + ldr q4, [x0], #16 + eor v2.16b, v2.16b, v4.16b + eor v0.16b, v6.16b, v0.16b + ldr q4, [x0], #16 + str q0, [x1], #16 + str q1, [x1], #16 + eor v0.16b, v7.16b, v4.16b + ldr q1, [x0], #16 + str q2, [x1], #16 + ldr q2, [x0], #16 + ldr q15, [x0], #16 + str q0, [x1], #16 + eor v0.16b, v5.16b, v2.16b + eor v1.16b, v3.16b, v1.16b + str q1, [x1], #16 + str q0, [x1], #16 + + b .Lcbc_dec_loop + +.Lcbc_dec_loop_finish: + adds x2, x2, #8 + beq .Lcbc_dec_done + + ldr q0, [x0], #16 // load input + cmp x2, #2 + blo .Lcbc_dec_one + ldr q1, [x0], #16 + mov x9, sp // pass the key + mov x10, x15 + beq .Lcbc_dec_two + ldr q2, [x0], #16 + cmp x2, #4 + blo .Lcbc_dec_three + ldr q3, [x0], #16 + beq .Lcbc_dec_four + ldr q4, [x0], #16 + cmp x2, #6 + blo .Lcbc_dec_five + ldr q5, [x0], #16 + beq 
.Lcbc_dec_six + ldr q6, [x0], #-6*16 + + bl _bsaes_decrypt8 + + ldr q5, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q8, [x0], #16 + ldr q9, [x0], #16 + ldr q10, [x0], #16 + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + eor v1.16b, v1.16b, v5.16b + ldr q5, [x0], #16 + eor v6.16b, v6.16b, v8.16b + ldr q15, [x0] + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + str q1, [x1], #16 + eor v0.16b, v7.16b, v0.16b + str q6, [x1], #16 + eor v1.16b, v3.16b, v5.16b + str q4, [x1], #16 + str q2, [x1], #16 + str q0, [x1], #16 + str q1, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_six: + sub x0, x0, #0x60 + bl _bsaes_decrypt8 + ldr q3, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q5, [x0], #16 + ldr q8, [x0], #16 + ldr q9, [x0], #16 + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + eor v1.16b, v1.16b, v3.16b + ldr q15, [x0] + eor v3.16b, v6.16b, v5.16b + eor v4.16b, v4.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + str q1, [x1], #16 + eor v0.16b, v7.16b, v0.16b + str q3, [x1], #16 + str q4, [x1], #16 + str q2, [x1], #16 + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_five: + sub x0, x0, #0x50 + bl _bsaes_decrypt8 + ldr q3, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q5, [x0], #16 + ldr q7, [x0], #16 + ldr q8, [x0], #16 + str q0, [x1], #16 // write output + ldr q15, [x0] + eor v0.16b, v1.16b, v3.16b + eor v1.16b, v6.16b, v5.16b + eor v3.16b, v4.16b, v7.16b + str q0, [x1], #16 + eor v0.16b, v2.16b, v8.16b + str q1, [x1], #16 + str q3, [x1], #16 + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_four: + sub x0, x0, #0x40 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q3, [x0], #16 + ldr q5, [x0], #16 + str q0, [x1], #16 // write output + ldr q15, [x0] + eor v0.16b, v1.16b, v2.16b + eor v1.16b, v6.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + str q0, [x1], #16 + str q1, [x1], #16 + str q2, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_three: + sub x0, x0, #0x30 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q3, [x0], #16 + ldr q15, [x0] + str q0, [x1], #16 // write output + eor v0.16b, v1.16b, v2.16b + eor v1.16b, v6.16b, v3.16b + str q0, [x1], #16 + str q1, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_two: + sub x0, x0, #0x20 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q15, [x0] + str q0, [x1], #16 // write output + eor v0.16b, v1.16b, v2.16b + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_one: + sub x0, x0, #0x10 + stp x1, x4, [sp, #-32]! 
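+ // single remaining block: preserve the output/IV pointers and the frame base, keep the IV in v8, and fall back to AES_decrypt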
+ str x14, [sp, #16] + mov v8.16b, v15.16b + mov v15.16b, v0.16b + mov x2, x3 + bl AES_decrypt + ldr x14, [sp, #16] + ldp x1, x4, [sp], #32 + ldr q0, [x1] // load result + eor v0.16b, v0.16b, v8.16b // ^= IV + str q0, [x1] // write output + +.align 4 +.Lcbc_dec_done: + movi v0.16b, #0 + movi v1.16b, #0 +.Lcbc_dec_bzero: // wipe key schedule [if any] + stp q0, q1, [sp], #32 + cmp sp, x14 + bne .Lcbc_dec_bzero + str q15, [x4] // return IV + ldp d8, d9, [sp, #16] + ldp d10, d15, [sp, #32] + ldp x29, x30, [sp], #48 + ret +.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt + +.globl ossl_bsaes_ctr32_encrypt_blocks +.type ossl_bsaes_ctr32_encrypt_blocks,%function +.align 4 +// On entry: +// x0 -> input text (whole 16-byte blocks) +// x1 -> output text (whole 16-byte blocks) +// x2 = number of 16-byte blocks to encrypt/decrypt (> 0) +// x3 -> key +// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block +// On exit: +// Output text filled in +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + cmp x2, #8 // use plain AES for + blo .Lctr_enc_short // small sizes + + stp x29, x30, [sp, #-80]! + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldr w15, [x3, #240] // get # of rounds + mov x14, sp + + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x3 // pass key + mov x10, x15 // pass # of rounds + mov sp, x17 // sp is sp + bl _bsaes_key_convert + eor v7.16b, v7.16b, v15.16b // fix up last round key + str q7, [x17] // save last round key + + ldr q0, [x4] // load counter + add x13, x11, #.LREVM0SR-.LM0_bigendian + ldr q4, [sp] // load round0 key + + movi v8.4s, #1 // compose 1<<96 + movi v9.16b, #0 + rev32 v15.16b, v0.16b + rev32 v0.16b, v0.16b + ext v11.16b, v9.16b, v8.16b, #4 + rev32 v4.16b, v4.16b + add v12.4s, v11.4s, v11.4s // compose 2<<96 + str q4, [sp] // save adjusted round0 key + add v13.4s, v11.4s, v12.4s // compose 3<<96 + add v14.4s, v12.4s, v12.4s // compose 4<<96 + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + // Intermix prologue from _bsaes_encrypt8 to use the opportunity + // to flip byte order in 32-bit counter + + add v1.4s, v15.4s, v11.4s // +1 + add x9, sp, #0x10 // pass next round key + add v2.4s, v15.4s, v12.4s // +2 + ldr q9, [x13] // .LREVM0SR + ldr q8, [sp] // load round0 key + add v3.4s, v15.4s, v13.4s // +3 + mov x10, x15 // pass rounds + sub x11, x13, #.LREVM0SR-.LSR // pass constants + add v6.4s, v2.4s, v14.4s + add v4.4s, v15.4s, v14.4s // +4 + add v7.4s, v3.4s, v14.4s + add v15.4s, v4.4s, v14.4s // next counter + add v5.4s, v1.4s, v14.4s + + bl _bsaes_encrypt8_alt + + subs x2, x2, #8 + blo .Lctr_enc_loop_done + + ldr q16, [x0], #16 + ldr q17, [x0], #16 + eor v1.16b, v1.16b, v17.16b + ldr q17, [x0], #16 + eor v0.16b, v0.16b, v16.16b + eor v4.16b, v4.16b, v17.16b + str q0, [x1], #16 + ldr q16, [x0], #16 + str q1, [x1], #16 + mov v0.16b, v15.16b + str q4, [x1], #16 + ldr q1, [x0], #16 + eor v4.16b, v6.16b, v16.16b + eor v1.16b, v3.16b, v1.16b + ldr q3, [x0], #16 + eor v3.16b, v7.16b, v3.16b + ldr q6, [x0], #16 + eor v2.16b, v2.16b, v6.16b + ldr q6, [x0], #16 + eor v5.16b, v5.16b, v6.16b + str q4, [x1], #16 + str q1, [x1], #16 + str q3, [x1], #16 + str q2, [x1], #16 + str q5, [x1], #16 + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + 
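+ // one to seven blocks remain: undo the loop bias on the block count, then XOR and store only the keystream blocks actually consumed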
add x2, x2, #8 + ldr q16, [x0], #16 // load input + eor v0.16b, v0.16b, v16.16b + str q0, [x1], #16 // write output + cmp x2, #2 + blo .Lctr_enc_done + ldr q17, [x0], #16 + eor v1.16b, v1.16b, v17.16b + str q1, [x1], #16 + beq .Lctr_enc_done + ldr q18, [x0], #16 + eor v4.16b, v4.16b, v18.16b + str q4, [x1], #16 + cmp x2, #4 + blo .Lctr_enc_done + ldr q19, [x0], #16 + eor v6.16b, v6.16b, v19.16b + str q6, [x1], #16 + beq .Lctr_enc_done + ldr q20, [x0], #16 + eor v3.16b, v3.16b, v20.16b + str q3, [x1], #16 + cmp x2, #6 + blo .Lctr_enc_done + ldr q21, [x0], #16 + eor v7.16b, v7.16b, v21.16b + str q7, [x1], #16 + beq .Lctr_enc_done + ldr q22, [x0] + eor v2.16b, v2.16b, v22.16b + str q2, [x1], #16 + +.Lctr_enc_done: + movi v0.16b, #0 + movi v1.16b, #0 +.Lctr_enc_bzero: // wipe key schedule [if any] + stp q0, q1, [sp], #32 + cmp sp, x14 + bne .Lctr_enc_bzero + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x29, x30, [sp], #80 + ret + +.Lctr_enc_short: + stp x29, x30, [sp, #-96]! + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + str x23, [sp, #48] + + mov x19, x0 // copy arguments + mov x20, x1 + mov x21, x2 + mov x22, x3 + ldr w23, [x4, #12] // load counter .LSW + ldr q1, [x4] // load whole counter value +#ifdef __AARCH64EL__ + rev w23, w23 +#endif + str q1, [sp, #80] // copy counter value + +.Lctr_enc_short_loop: + add x0, sp, #80 // input counter value + add x1, sp, #64 // output on the stack + mov x2, x22 // key + + bl AES_encrypt + + ldr q0, [x19], #16 // load input + ldr q1, [sp, #64] // load encrypted counter + add x23, x23, #1 +#ifdef __AARCH64EL__ + rev w0, w23 + str w0, [sp, #80+12] // next counter value +#else + str w23, [sp, #80+12] // next counter value +#endif + eor v0.16b, v0.16b, v1.16b + str q0, [x20], #16 // store output + subs x21, x21, #1 + bne .Lctr_enc_short_loop + + movi v0.16b, #0 + movi v1.16b, #0 + stp q0, q1, [sp, #64] + + ldr x23, [sp, #48] + ldp x21, x22, [sp, #32] + ldp x19, x20, [sp, #16] + ldp x29, x30, [sp], #96 + ret +.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks + +.globl ossl_bsaes_xts_encrypt +.type ossl_bsaes_xts_encrypt,%function +.align 4 +// On entry: +// x0 -> input plaintext +// x1 -> output ciphertext +// x2 -> length of text in bytes (must be at least 16) +// x3 -> key1 (used to encrypt the XORed plaintext blocks) +// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) +// x5 -> 16-byte initial vector (typically, sector number) +// On exit: +// Output ciphertext filled in +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_xts_encrypt: + AARCH64_VALID_CALL_TARGET + // Stack layout: + // sp -> + // nrounds*128-96 bytes: key schedule + // x19 -> + // 16 bytes: frame record + // 4*16 bytes: tweak storage across _bsaes_encrypt8 + // 6*8 bytes: storage for 5 callee-saved general-purpose registers + // 8*8 bytes: storage for 8 callee-saved SIMD registers + stp x29, x30, [sp, #-192]! 
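+ // save the callee-saved general-purpose and SIMD registers at the offsets described in the stack layout above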
+ stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + str x23, [sp, #112] + stp d8, d9, [sp, #128] + stp d10, d11, [sp, #144] + stp d12, d13, [sp, #160] + stp d14, d15, [sp, #176] + + mov x19, sp + mov x20, x0 + mov x21, x1 + mov x22, x2 + mov x23, x3 + + // generate initial tweak + sub sp, sp, #16 + mov x0, x5 // iv[] + mov x1, sp + mov x2, x4 // key2 + bl AES_encrypt + ldr q11, [sp], #16 + + ldr w1, [x23, #240] // get # of rounds + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x23 // pass key + mov x10, x1 // pass # of rounds + mov sp, x17 + bl _bsaes_key_convert + eor v15.16b, v15.16b, v7.16b // fix up last round key + str q15, [x17] // save last round key + + subs x22, x22, #0x80 + blo .Lxts_enc_short + b .Lxts_enc_loop + +.align 4 +.Lxts_enc_loop: + ldr q8, .Lxts_magic + mov x10, x1 // pass rounds + add x2, x19, #16 + ldr q0, [x20], #16 + sshr v1.2d, v11.2d, #63 + mov x9, sp // pass key schedule + ldr q6, .Lxts_magic+16 + add v2.2d, v11.2d, v11.2d + cmtst v3.2d, v11.2d, v6.2d + and v1.16b, v1.16b, v8.16b + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + ldr q4, [x20], #16 + eor v12.16b, v2.16b, v1.16b + eor v1.16b, v4.16b, v12.16b + eor v0.16b, v0.16b, v11.16b + cmtst v2.2d, v12.2d, v6.2d + add v4.2d, v12.2d, v12.2d + add x0, x19, #16 + ext v3.16b, v3.16b, v3.16b, #8 + and v2.16b, v2.16b, v8.16b + eor v13.16b, v4.16b, v3.16b + ldr q3, [x20], #16 + ext v4.16b, v2.16b, v2.16b, #8 + eor v2.16b, v3.16b, v13.16b + ldr q3, [x20], #16 + add v5.2d, v13.2d, v13.2d + cmtst v7.2d, v13.2d, v6.2d + and v7.16b, v7.16b, v8.16b + ldr q9, [x20], #16 + ext v7.16b, v7.16b, v7.16b, #8 + ldr q10, [x20], #16 + eor v14.16b, v5.16b, v4.16b + ldr q16, [x20], #16 + add v4.2d, v14.2d, v14.2d + eor v3.16b, v3.16b, v14.16b + eor v15.16b, v4.16b, v7.16b + add v5.2d, v15.2d, v15.2d + ldr q7, [x20], #16 + cmtst v4.2d, v14.2d, v6.2d + and v17.16b, v4.16b, v8.16b + cmtst v18.2d, v15.2d, v6.2d + eor v4.16b, v9.16b, v15.16b + ext v9.16b, v17.16b, v17.16b, #8 + eor v9.16b, v5.16b, v9.16b + add v17.2d, v9.2d, v9.2d + and v18.16b, v18.16b, v8.16b + eor v5.16b, v10.16b, v9.16b + str q9, [x2], #16 + ext v10.16b, v18.16b, v18.16b, #8 + cmtst v9.2d, v9.2d, v6.2d + and v9.16b, v9.16b, v8.16b + eor v10.16b, v17.16b, v10.16b + cmtst v17.2d, v10.2d, v6.2d + eor v6.16b, v16.16b, v10.16b + str q10, [x2], #16 + ext v9.16b, v9.16b, v9.16b, #8 + add v10.2d, v10.2d, v10.2d + eor v9.16b, v10.16b, v9.16b + str q9, [x2], #16 + eor v7.16b, v7.16b, v9.16b + add v9.2d, v9.2d, v9.2d + and v8.16b, v17.16b, v8.16b + ext v8.16b, v8.16b, v8.16b, #8 + eor v8.16b, v9.16b, v8.16b + str q8, [x2] // next round tweak + + bl _bsaes_encrypt8 + + ldr q8, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q9, [x0], #16 + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + ldr q10, [x0], #16 + eor v3.16b, v3.16b, v15.16b + subs x22, x22, #0x80 + str q0, [x21], #16 + ldr q11, [x0] // next round tweak + str q1, [x21], #16 + eor v0.16b, v7.16b, v8.16b + eor v1.16b, v2.16b, v9.16b + str q4, [x21], #16 + eor v2.16b, v5.16b, v10.16b + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + str q2, [x21], #16 + bpl .Lxts_enc_loop + +.Lxts_enc_short: + adds x22, x22, #0x70 + bmi .Lxts_enc_done + + ldr q8, .Lxts_magic + sshr v1.2d, v11.2d, #63 + add v2.2d, v11.2d, v11.2d + ldr q9, .Lxts_magic+16 + subs x22, x22, #0x10 + ldr q0, [x20], #16 + and v1.16b, v1.16b, 
v8.16b + cmtst v3.2d, v11.2d, v9.2d + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + eor v12.16b, v2.16b, v1.16b + ext v1.16b, v3.16b, v3.16b, #8 + add v2.2d, v12.2d, v12.2d + cmtst v3.2d, v12.2d, v9.2d + eor v13.16b, v2.16b, v1.16b + and v22.16b, v3.16b, v8.16b + bmi .Lxts_enc_1 + + ext v2.16b, v22.16b, v22.16b, #8 + add v3.2d, v13.2d, v13.2d + ldr q1, [x20], #16 + cmtst v4.2d, v13.2d, v9.2d + subs x22, x22, #0x10 + eor v14.16b, v3.16b, v2.16b + and v23.16b, v4.16b, v8.16b + bmi .Lxts_enc_2 + + ext v3.16b, v23.16b, v23.16b, #8 + add v4.2d, v14.2d, v14.2d + ldr q2, [x20], #16 + cmtst v5.2d, v14.2d, v9.2d + eor v0.16b, v0.16b, v11.16b + subs x22, x22, #0x10 + eor v15.16b, v4.16b, v3.16b + and v24.16b, v5.16b, v8.16b + bmi .Lxts_enc_3 + + ext v4.16b, v24.16b, v24.16b, #8 + add v5.2d, v15.2d, v15.2d + ldr q3, [x20], #16 + cmtst v6.2d, v15.2d, v9.2d + eor v1.16b, v1.16b, v12.16b + subs x22, x22, #0x10 + eor v16.16b, v5.16b, v4.16b + and v25.16b, v6.16b, v8.16b + bmi .Lxts_enc_4 + + ext v5.16b, v25.16b, v25.16b, #8 + add v6.2d, v16.2d, v16.2d + add x0, x19, #16 + cmtst v7.2d, v16.2d, v9.2d + ldr q4, [x20], #16 + eor v2.16b, v2.16b, v13.16b + str q16, [x0], #16 + subs x22, x22, #0x10 + eor v17.16b, v6.16b, v5.16b + and v26.16b, v7.16b, v8.16b + bmi .Lxts_enc_5 + + ext v7.16b, v26.16b, v26.16b, #8 + add v18.2d, v17.2d, v17.2d + ldr q5, [x20], #16 + eor v3.16b, v3.16b, v14.16b + str q17, [x0], #16 + subs x22, x22, #0x10 + eor v18.16b, v18.16b, v7.16b + bmi .Lxts_enc_6 + + ldr q6, [x20], #16 + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + str q18, [x0] // next round tweak + mov x9, sp // pass key schedule + mov x10, x1 + add x0, x19, #16 + sub x22, x22, #0x10 + eor v6.16b, v6.16b, v17.16b + + bl _bsaes_encrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q17, [x0], #16 + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + ldr q11, [x0] // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + eor v1.16b, v2.16b, v17.16b + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_6: + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + ldr q11, [x0] // next round tweak + eor v3.16b, v3.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_5: + eor v3.16b, v3.16b, v14.16b + eor v4.16b, v4.16b, v15.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q11, [x0] // next round tweak + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_4: + eor v2.16b, v2.16b, v13.16b + eor v3.16b, v3.16b, v14.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor 
v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + mov v11.16b, v15.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + str q6, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_3: + eor v1.16b, v1.16b, v12.16b + eor v2.16b, v2.16b, v13.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v13.16b + mov v11.16b, v14.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_2: + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov v11.16b, v13.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_1: + eor v0.16b, v0.16b, v11.16b + sub x0, sp, #16 + sub x1, sp, #16 + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + str q0, [sp, #-16]! + + bl AES_encrypt + + ldr q0, [sp], #16 + trn1 v13.2d, v11.2d, v13.2d + trn1 v11.2d, v12.2d, v14.2d // next round tweak + eor v0.16b, v0.16b, v13.16b + str q0, [x21], #16 + +.Lxts_enc_done: + adds x22, x22, #0x10 + beq .Lxts_enc_ret + + sub x6, x21, #0x10 + // Penultimate plaintext block produces final ciphertext part-block + // plus remaining part of final plaintext block. Move ciphertext part + // to final position and reuse penultimate ciphertext block buffer to + // construct final plaintext block +.Lxts_enc_steal: + ldrb w0, [x20], #1 + ldrb w1, [x21, #-0x10] + strb w0, [x21, #-0x10] + strb w1, [x21], #1 + + subs x22, x22, #1 + bhi .Lxts_enc_steal + + // Finally encrypt the penultimate ciphertext block using the + // last tweak + ldr q0, [x6] + eor v0.16b, v0.16b, v11.16b + str q0, [sp, #-16]! 
+ mov x0, sp + mov x1, sp + mov x2, x23 + mov x21, x6 + mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers + + bl AES_encrypt + + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v11.16b + str q0, [x21] + +.Lxts_enc_ret: + + movi v0.16b, #0 + movi v1.16b, #0 +.Lxts_enc_bzero: // wipe key schedule + stp q0, q1, [sp], #32 + cmp sp, x19 + bne .Lxts_enc_bzero + + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldr x23, [sp, #112] + ldp d8, d9, [sp, #128] + ldp d10, d11, [sp, #144] + ldp d12, d13, [sp, #160] + ldp d14, d15, [sp, #176] + ldp x29, x30, [sp], #192 + ret +.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt + +// The assembler doesn't seem capable of de-duplicating these when expressed +// using `ldr qd,=` syntax, so assign a symbolic address +.align 5 +.Lxts_magic: +.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 + +.globl ossl_bsaes_xts_decrypt +.type ossl_bsaes_xts_decrypt,%function +.align 4 +// On entry: +// x0 -> input ciphertext +// x1 -> output plaintext +// x2 -> length of text in bytes (must be at least 16) +// x3 -> key1 (used to decrypt the XORed ciphertext blocks) +// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) +// x5 -> 16-byte initial vector (typically, sector number) +// On exit: +// Output plaintext filled in +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_xts_decrypt: + AARCH64_VALID_CALL_TARGET + // Stack layout: + // sp -> + // nrounds*128-96 bytes: key schedule + // x19 -> + // 16 bytes: frame record + // 4*16 bytes: tweak storage across _bsaes_decrypt8 + // 6*8 bytes: storage for 5 callee-saved general-purpose registers + // 8*8 bytes: storage for 8 callee-saved SIMD registers + stp x29, x30, [sp, #-192]! 
+ stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + str x23, [sp, #112] + stp d8, d9, [sp, #128] + stp d10, d11, [sp, #144] + stp d12, d13, [sp, #160] + stp d14, d15, [sp, #176] + + mov x19, sp + mov x20, x0 + mov x21, x1 + mov x22, x2 + mov x23, x3 + + // generate initial tweak + sub sp, sp, #16 + mov x0, x5 // iv[] + mov x1, sp + mov x2, x4 // key2 + bl AES_encrypt + ldr q11, [sp], #16 + + ldr w1, [x23, #240] // get # of rounds + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x23 // pass key + mov x10, x1 // pass # of rounds + mov sp, x17 + bl _bsaes_key_convert + ldr q6, [sp] + str q15, [x17] // save last round key + eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) + str q6, [sp] + + sub x30, x22, #0x10 + tst x22, #0xf // if not multiple of 16 + csel x22, x30, x22, ne // subtract another 16 bytes + subs x22, x22, #0x80 + + blo .Lxts_dec_short + b .Lxts_dec_loop + +.align 4 +.Lxts_dec_loop: + ldr q8, .Lxts_magic + mov x10, x1 // pass rounds + add x2, x19, #16 + ldr q0, [x20], #16 + sshr v1.2d, v11.2d, #63 + mov x9, sp // pass key schedule + ldr q6, .Lxts_magic+16 + add v2.2d, v11.2d, v11.2d + cmtst v3.2d, v11.2d, v6.2d + and v1.16b, v1.16b, v8.16b + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + ldr q4, [x20], #16 + eor v12.16b, v2.16b, v1.16b + eor v1.16b, v4.16b, v12.16b + eor v0.16b, v0.16b, v11.16b + cmtst v2.2d, v12.2d, v6.2d + add v4.2d, v12.2d, v12.2d + add x0, x19, #16 + ext v3.16b, v3.16b, v3.16b, #8 + and v2.16b, v2.16b, v8.16b + eor v13.16b, v4.16b, v3.16b + ldr q3, [x20], #16 + ext v4.16b, v2.16b, v2.16b, #8 + eor v2.16b, v3.16b, v13.16b + ldr q3, [x20], #16 + add v5.2d, v13.2d, v13.2d + cmtst v7.2d, v13.2d, v6.2d + and v7.16b, v7.16b, v8.16b + ldr q9, [x20], #16 + ext v7.16b, v7.16b, v7.16b, #8 + ldr q10, [x20], #16 + eor v14.16b, v5.16b, v4.16b + ldr q16, [x20], #16 + add v4.2d, v14.2d, v14.2d + eor v3.16b, v3.16b, v14.16b + eor v15.16b, v4.16b, v7.16b + add v5.2d, v15.2d, v15.2d + ldr q7, [x20], #16 + cmtst v4.2d, v14.2d, v6.2d + and v17.16b, v4.16b, v8.16b + cmtst v18.2d, v15.2d, v6.2d + eor v4.16b, v9.16b, v15.16b + ext v9.16b, v17.16b, v17.16b, #8 + eor v9.16b, v5.16b, v9.16b + add v17.2d, v9.2d, v9.2d + and v18.16b, v18.16b, v8.16b + eor v5.16b, v10.16b, v9.16b + str q9, [x2], #16 + ext v10.16b, v18.16b, v18.16b, #8 + cmtst v9.2d, v9.2d, v6.2d + and v9.16b, v9.16b, v8.16b + eor v10.16b, v17.16b, v10.16b + cmtst v17.2d, v10.2d, v6.2d + eor v6.16b, v16.16b, v10.16b + str q10, [x2], #16 + ext v9.16b, v9.16b, v9.16b, #8 + add v10.2d, v10.2d, v10.2d + eor v9.16b, v10.16b, v9.16b + str q9, [x2], #16 + eor v7.16b, v7.16b, v9.16b + add v9.2d, v9.2d, v9.2d + and v8.16b, v17.16b, v8.16b + ext v8.16b, v8.16b, v8.16b, #8 + eor v8.16b, v9.16b, v8.16b + str q8, [x2] // next round tweak + + bl _bsaes_decrypt8 + + eor v6.16b, v6.16b, v13.16b + eor v0.16b, v0.16b, v11.16b + ldr q8, [x0], #16 + eor v7.16b, v7.16b, v8.16b + str q0, [x21], #16 + eor v0.16b, v1.16b, v12.16b + ldr q1, [x0], #16 + eor v1.16b, v3.16b, v1.16b + subs x22, x22, #0x80 + eor v2.16b, v2.16b, v15.16b + eor v3.16b, v4.16b, v14.16b + ldr q4, [x0], #16 + str q0, [x21], #16 + ldr q11, [x0] // next round tweak + eor v0.16b, v5.16b, v4.16b + str q6, [x21], #16 + str q3, [x21], #16 + str q2, [x21], #16 + str q7, [x21], #16 + str q1, [x21], #16 + str q0, [x21], #16 + bpl .Lxts_dec_loop + +.Lxts_dec_short: + adds x22, x22, #0x70 + bmi .Lxts_dec_done + + ldr 
q8, .Lxts_magic + sshr v1.2d, v11.2d, #63 + add v2.2d, v11.2d, v11.2d + ldr q9, .Lxts_magic+16 + subs x22, x22, #0x10 + ldr q0, [x20], #16 + and v1.16b, v1.16b, v8.16b + cmtst v3.2d, v11.2d, v9.2d + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + eor v12.16b, v2.16b, v1.16b + ext v1.16b, v3.16b, v3.16b, #8 + add v2.2d, v12.2d, v12.2d + cmtst v3.2d, v12.2d, v9.2d + eor v13.16b, v2.16b, v1.16b + and v22.16b, v3.16b, v8.16b + bmi .Lxts_dec_1 + + ext v2.16b, v22.16b, v22.16b, #8 + add v3.2d, v13.2d, v13.2d + ldr q1, [x20], #16 + cmtst v4.2d, v13.2d, v9.2d + subs x22, x22, #0x10 + eor v14.16b, v3.16b, v2.16b + and v23.16b, v4.16b, v8.16b + bmi .Lxts_dec_2 + + ext v3.16b, v23.16b, v23.16b, #8 + add v4.2d, v14.2d, v14.2d + ldr q2, [x20], #16 + cmtst v5.2d, v14.2d, v9.2d + eor v0.16b, v0.16b, v11.16b + subs x22, x22, #0x10 + eor v15.16b, v4.16b, v3.16b + and v24.16b, v5.16b, v8.16b + bmi .Lxts_dec_3 + + ext v4.16b, v24.16b, v24.16b, #8 + add v5.2d, v15.2d, v15.2d + ldr q3, [x20], #16 + cmtst v6.2d, v15.2d, v9.2d + eor v1.16b, v1.16b, v12.16b + subs x22, x22, #0x10 + eor v16.16b, v5.16b, v4.16b + and v25.16b, v6.16b, v8.16b + bmi .Lxts_dec_4 + + ext v5.16b, v25.16b, v25.16b, #8 + add v6.2d, v16.2d, v16.2d + add x0, x19, #16 + cmtst v7.2d, v16.2d, v9.2d + ldr q4, [x20], #16 + eor v2.16b, v2.16b, v13.16b + str q16, [x0], #16 + subs x22, x22, #0x10 + eor v17.16b, v6.16b, v5.16b + and v26.16b, v7.16b, v8.16b + bmi .Lxts_dec_5 + + ext v7.16b, v26.16b, v26.16b, #8 + add v18.2d, v17.2d, v17.2d + ldr q5, [x20], #16 + eor v3.16b, v3.16b, v14.16b + str q17, [x0], #16 + subs x22, x22, #0x10 + eor v18.16b, v18.16b, v7.16b + bmi .Lxts_dec_6 + + ldr q6, [x20], #16 + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + str q18, [x0] // next round tweak + mov x9, sp // pass key schedule + mov x10, x1 + add x0, x19, #16 + sub x22, x22, #0x10 + eor v6.16b, v6.16b, v17.16b + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q17, [x0], #16 + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + eor v2.16b, v2.16b, v15.16b + ldr q11, [x0] // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + eor v1.16b, v3.16b, v17.16b + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_6: + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + ldr q11, [x0] // next round tweak + eor v2.16b, v2.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + str q0, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_5: + eor v3.16b, v3.16b, v14.16b + eor v4.16b, v4.16b, v15.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q11, [x0] // next round tweak + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + eor v2.16b, v2.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_4: + eor v2.16b, v2.16b, v13.16b + eor v3.16b, v3.16b, v14.16b + mov x9, sp 
// pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + mov v11.16b, v15.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + str q4, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_3: + eor v1.16b, v1.16b, v12.16b + eor v2.16b, v2.16b, v13.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + mov v11.16b, v14.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_2: + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov v11.16b, v13.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_1: + eor v0.16b, v0.16b, v11.16b + sub x0, sp, #16 + sub x1, sp, #16 + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + str q0, [sp, #-16]! + + bl AES_decrypt + + ldr q0, [sp], #16 + trn1 v13.2d, v11.2d, v13.2d + trn1 v11.2d, v12.2d, v14.2d // next round tweak + eor v0.16b, v0.16b, v13.16b + str q0, [x21], #16 + +.Lxts_dec_done: + adds x22, x22, #0x10 + beq .Lxts_dec_ret + + // calculate one round of extra tweak for the stolen ciphertext + ldr q8, .Lxts_magic + sshr v6.2d, v11.2d, #63 + and v6.16b, v6.16b, v8.16b + add v12.2d, v11.2d, v11.2d + ext v6.16b, v6.16b, v6.16b, #8 + eor v12.16b, v12.16b, v6.16b + + // perform the final decryption with the last tweak value + ldr q0, [x20], #16 + eor v0.16b, v0.16b, v12.16b + str q0, [sp, #-16]! + mov x0, sp + mov x1, sp + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + + bl AES_decrypt + + trn1 v12.2d, v12.2d, v14.2d + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v12.16b + str q0, [x21] + + mov x6, x21 + // Penultimate ciphertext block produces final plaintext part-block + // plus remaining part of final ciphertext block. Move plaintext part + // to final position and reuse penultimate plaintext block buffer to + // construct final ciphertext block +.Lxts_dec_steal: + ldrb w1, [x21] + ldrb w0, [x20], #1 + strb w1, [x21, #0x10] + strb w0, [x21], #1 + + subs x22, x22, #1 + bhi .Lxts_dec_steal + + // Finally decrypt the penultimate plaintext block using the + // penultimate tweak + ldr q0, [x6] + eor v0.16b, v0.16b, v11.16b + str q0, [sp, #-16]! 
+ mov x0, sp + mov x1, sp + mov x2, x23 + mov x21, x6 + + bl AES_decrypt + + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v11.16b + str q0, [x21] + +.Lxts_dec_ret: + + movi v0.16b, #0 + movi v1.16b, #0 +.Lxts_dec_bzero: // wipe key schedule + stp q0, q1, [sp], #32 + cmp sp, x19 + bne .Lxts_dec_bzero + + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldr x23, [sp, #112] + ldp d8, d9, [sp, #128] + ldp d10, d11, [sp, #144] + ldp d12, d13, [sp, #160] + ldp d14, d15, [sp, #176] + ldp x29, x30, [sp], #192 + ret +.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt diff --git a/sys/crypto/openssl/aarch64/chacha-armv8-sve.S b/sys/crypto/openssl/aarch64/chacha-armv8-sve.S new file mode 100644 index 000000000000..e595adf377f9 --- /dev/null +++ b/sys/crypto/openssl/aarch64/chacha-armv8-sve.S @@ -0,0 +1,3559 @@ +/* Do not modify. This file is auto-generated from chacha-armv8-sve.pl. */ +// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// +// ChaCha20 for ARMv8 via SVE +// +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" + +.arch armv8-a + + +.hidden OPENSSL_armcap_P + +.text + +.section .rodata +.align 5 +.type _chacha_sve_consts,%object +_chacha_sve_consts: +.Lchacha20_consts: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lrot8: +.word 0x02010003,0x04040404,0x02010003,0x04040404 +.size _chacha_sve_consts,.-_chacha_sve_consts + +.previous + +.globl ChaCha20_ctr32_sve +.type ChaCha20_ctr32_sve,%function +.align 5 +ChaCha20_ctr32_sve: + AARCH64_VALID_CALL_TARGET +.inst 0x04a0e3e5 //cntw x5, ALL, MUL #1 + cmp x2,x5,lsl #6 + b.lt .Lreturn + mov x7,0 + adrp x6,OPENSSL_armcap_P + ldr w6,[x6,#:lo12:OPENSSL_armcap_P] + tst w6,#ARMV8_SVE2 + b.eq 1f + mov x7,1 + b 2f +1: + cmp x5,4 + b.le .Lreturn + adrp x6,.Lrot8 + add x6,x6,#:lo12:.Lrot8 + ldp w9,w10,[x6] +.inst 0x04aa4d3f //index z31.s,w9,w10 +2: + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,-192]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + stp x16,x17,[sp,64] + stp x18,x19,[sp,80] + stp x20,x21,[sp,96] + stp x22,x23,[sp,112] + stp x24,x25,[sp,128] + stp x26,x27,[sp,144] + stp x28,x29,[sp,160] + str x30,[sp,176] + + adrp x6,.Lchacha20_consts + add x6,x6,#:lo12:.Lchacha20_consts + ldp x23,x24,[x6] + ldp x25,x26,[x3] + ldp x27,x28,[x3, 16] + ldp x29,x30,[x4] +.inst 0x2599e3e0 //ptrues p0.s,ALL +#ifdef __AARCH64EB__ + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x29,x29,#32 + ror x30,x30,#32 +#endif + cbz x7, 1f +.align 5 +100: + subs x7,x2,x5,lsl #6 + b.lt 110f + mov x2,x7 + b.eq 101f + cmp x2,64 + b.lt 101f + mixin=1 + lsr x8,x23,#32 +.inst 0x05a03ae0 //dup z0.s,w23 +.inst 0x05a03af9 //dup z25.s,w23 +.if mixin == 1 + mov w7,w23 +.endif +.inst 0x05a03904 //dup z4.s,w8 +.inst 0x05a0391a //dup z26.s,w8 + lsr x10,x24,#32 +.inst 0x05a03b08 //dup z8.s,w24 +.inst 0x05a03b1b //dup z27.s,w24 +.if mixin == 1 + mov w9,w24 +.endif +.inst 0x05a0394c //dup z12.s,w10 +.inst 0x05a0395c //dup z28.s,w10 + lsr x12,x25,#32 +.inst 0x05a03b21 //dup z1.s,w25 +.inst 0x05a03b3d //dup z29.s,w25 +.if mixin == 1 + mov w11,w25 +.endif +.inst 0x05a03985 //dup z5.s,w12 +.inst 0x05a0399e //dup z30.s,w12 + lsr x14,x26,#32 +.inst 0x05a03b49 //dup z9.s,w26 +.inst 0x05a03b55 //dup z21.s,w26 +.if mixin == 1 + mov w13,w26 +.endif +.inst 0x05a039cd //dup z13.s,w14 +.inst 0x05a039d6 //dup z22.s,w14 + lsr x16,x27,#32 +.inst 0x05a03b62 //dup z2.s,w27 +.inst 0x05a03b77 //dup z23.s,w27 +.if mixin == 1 + mov w15,w27 +.endif +.inst 0x05a03a06 //dup z6.s,w16 +.inst 0x05a03a18 //dup z24.s,w16 + lsr x18,x28,#32 +.inst 0x05a03b8a //dup z10.s,w28 +.inst 0x05a03b91 //dup z17.s,w28 +.if mixin == 1 + mov w17,w28 +.endif +.inst 0x05a03a4e //dup z14.s,w18 +.inst 0x05a03a52 //dup z18.s,w18 + lsr x22,x30,#32 +.inst 0x05a03bcb //dup z11.s,w30 +.inst 0x05a03bd4 //dup z20.s,w30 +.if mixin == 1 + mov w21,w30 +.endif +.inst 0x05a03acf //dup z15.s,w22 +.inst 0x05a03adf //dup z31.s,w22 +.if mixin == 1 + add w20,w29,#1 + mov w19,w29 +.inst 0x04a14690 //index z16.s,w20,1 +.inst 0x04a14683 //index z3.s,w20,1 +.else +.inst 0x04a147b0 //index z16.s,w29,1 +.inst 0x04a147a3 //index z3.s,w29,1 +.endif + lsr x20,x29,#32 +.inst 0x05a03a87 //dup z7.s,w20 +.inst 0x05a03a93 //dup z19.s,w20 + mov x6,#10 +10: +.align 5 +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04703403 //xar z3.s,z3.s,z0.s,16 +.if mixin == 1 + ror w19,w19,16 +.endif +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04703487 //xar z7.s,z7.s,z4.s,16 +.if mixin == 1 + ror w20,w20,16 +.endif +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x0470350b //xar z11.s,z11.s,z8.s,16 +.if mixin == 1 + ror w21,w21,16 +.endif +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 0x0470358f //xar z15.s,z15.s,z12.s,16 +.if mixin == 1 + ror w22,w22,16 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x046c3441 
//xar z1.s,z1.s,z2.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x046c34c5 //xar z5.s,z5.s,z6.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04683403 //xar z3.s,z3.s,z0.s,24 +.if mixin == 1 + ror w19,w19,24 +.endif +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04683487 //xar z7.s,z7.s,z4.s,24 +.if mixin == 1 + ror w20,w20,24 +.endif +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x0468350b //xar z11.s,z11.s,z8.s,24 +.if mixin == 1 + ror w21,w21,24 +.endif +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 0x0468358f //xar z15.s,z15.s,z12.s,24 +.if mixin == 1 + ror w22,w22,24 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x04673441 //xar z1.s,z1.s,z2.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25 +.if mixin == 1 + ror w12,w12,25 +.endif +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x04673549 //xar z9.s,z9.s,z10.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x046735cd //xar z13.s,z13.s,z14.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x0470340f //xar z15.s,z15.s,z0.s,16 +.if mixin == 1 + ror w22,w22,16 +.endif +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04703483 //xar z3.s,z3.s,z4.s,16 +.if mixin == 1 + ror w19,w19,16 +.endif +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04703507 //xar z7.s,z7.s,z8.s,16 +.if mixin == 1 + ror w20,w20,16 +.endif +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x0470358b //xar z11.s,z11.s,z12.s,16 +.if mixin == 1 + ror w21,w21,16 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 
0x046c344d //xar z13.s,z13.s,z2.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x0468340f //xar z15.s,z15.s,z0.s,24 +.if mixin == 1 + ror w22,w22,24 +.endif +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04683483 //xar z3.s,z3.s,z4.s,24 +.if mixin == 1 + ror w19,w19,24 +.endif +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04683507 //xar z7.s,z7.s,z8.s,24 +.if mixin == 1 + ror w20,w20,24 +.endif +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x0468358b //xar z11.s,z11.s,z12.s,24 +.if mixin == 1 + ror w21,w21,24 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x04673545 //xar z5.s,z5.s,z10.s,25 +.if mixin == 1 + ror w12,w12,25 +.endif +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 0x0467344d //xar z13.s,z13.s,z2.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif + sub x6,x6,1 + cbnz x6,10b +.if mixin == 1 + add w7,w7,w23 +.endif +.inst 0x04b90000 //add z0.s,z0.s,z25.s +.if mixin == 1 + add x8,x8,x23,lsr #32 +.endif +.inst 0x04ba0084 //add z4.s,z4.s,z26.s +.if mixin == 1 + add x7,x7,x8,lsl #32 // pack +.endif +.if mixin == 1 + add w9,w9,w24 +.endif +.inst 0x04bb0108 //add z8.s,z8.s,z27.s +.if mixin == 1 + add x10,x10,x24,lsr #32 +.endif +.inst 0x04bc018c //add z12.s,z12.s,z28.s +.if mixin == 1 + add x9,x9,x10,lsl #32 // pack +.endif +.if mixin == 1 + ldp x8,x10,[x1],#16 +.endif +.if mixin == 1 + add w11,w11,w25 +.endif +.inst 0x04bd0021 //add z1.s,z1.s,z29.s +.if mixin == 1 + add x12,x12,x25,lsr #32 +.endif +.inst 0x04be00a5 //add z5.s,z5.s,z30.s +.if mixin == 1 + add x11,x11,x12,lsl #32 // pack +.endif +.if mixin == 1 + add w13,w13,w26 +.endif +.inst 0x04b50129 //add z9.s,z9.s,z21.s +.if mixin == 1 + add x14,x14,x26,lsr #32 +.endif +.inst 0x04b601ad //add z13.s,z13.s,z22.s +.if mixin == 1 + add x13,x13,x14,lsl #32 // pack +.endif +.if mixin == 1 + ldp x12,x14,[x1],#16 +.endif +.if mixin == 1 + add w15,w15,w27 +.endif +.inst 0x04b70042 //add z2.s,z2.s,z23.s +.if mixin == 1 + add x16,x16,x27,lsr #32 +.endif +.inst 0x04b800c6 //add z6.s,z6.s,z24.s +.if mixin == 1 + add x15,x15,x16,lsl #32 // pack +.endif +.if mixin == 1 + add w17,w17,w28 +.endif +.inst 0x04b1014a //add z10.s,z10.s,z17.s +.if mixin == 1 + add x18,x18,x28,lsr #32 +.endif +.inst 0x04b201ce //add z14.s,z14.s,z18.s +.if mixin == 1 + add x17,x17,x18,lsl #32 // pack +.endif +.if mixin == 1 + ldp x16,x18,[x1],#16 +.endif +.if mixin == 1 + add w19,w19,w29 +.endif +.inst 0x04b00063 //add z3.s,z3.s,z16.s +.if mixin == 1 + add x20,x20,x29,lsr #32 +.endif +.inst 0x04b300e7 
//add z7.s,z7.s,z19.s +.if mixin == 1 + add x19,x19,x20,lsl #32 // pack +.endif +.if mixin == 1 + add w21,w21,w30 +.endif +.inst 0x04b4016b //add z11.s,z11.s,z20.s +.if mixin == 1 + add x22,x22,x30,lsr #32 +.endif +.inst 0x04bf01ef //add z15.s,z15.s,z31.s +.if mixin == 1 + add x21,x21,x22,lsl #32 // pack +.endif +.if mixin == 1 + ldp x20,x22,[x1],#16 +.endif +#ifdef __AARCH64EB__ + rev x7,x7 +.inst 0x05a48000 //revb z0.s,p0/m,z0.s +.inst 0x05a48084 //revb z4.s,p0/m,z4.s + rev x9,x9 +.inst 0x05a48108 //revb z8.s,p0/m,z8.s +.inst 0x05a4818c //revb z12.s,p0/m,z12.s + rev x11,x11 +.inst 0x05a48021 //revb z1.s,p0/m,z1.s +.inst 0x05a480a5 //revb z5.s,p0/m,z5.s + rev x13,x13 +.inst 0x05a48129 //revb z9.s,p0/m,z9.s +.inst 0x05a481ad //revb z13.s,p0/m,z13.s + rev x15,x15 +.inst 0x05a48042 //revb z2.s,p0/m,z2.s +.inst 0x05a480c6 //revb z6.s,p0/m,z6.s + rev x17,x17 +.inst 0x05a4814a //revb z10.s,p0/m,z10.s +.inst 0x05a481ce //revb z14.s,p0/m,z14.s + rev x19,x19 +.inst 0x05a48063 //revb z3.s,p0/m,z3.s +.inst 0x05a480e7 //revb z7.s,p0/m,z7.s + rev x21,x21 +.inst 0x05a4816b //revb z11.s,p0/m,z11.s +.inst 0x05a481ef //revb z15.s,p0/m,z15.s +#endif +.if mixin == 1 + add x29,x29,#1 +.endif + cmp x5,4 + b.ne 200f +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.if mixin == 1 + eor x11,x11,x12 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if mixin == 1 + eor x13,x13,x14 +.endif +.if mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 +.endif +.if mixin == 1 + eor x21,x21,x22 +.endif + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23021 //eor z1.d,z1.d,z18.d +.inst 0x04b33042 //eor z2.d,z2.d,z19.d +.inst 0x04b43063 //eor z3.d,z3.d,z20.d +.inst 0x04b53084 //eor z4.d,z4.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b730c6 //eor z6.d,z6.d,z23.d +.inst 0x04b830e7 //eor z7.d,z7.d,z24.d + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13108 //eor z8.d,z8.d,z17.d +.inst 0x04b23129 //eor z9.d,z9.d,z18.d +.if mixin == 1 + stp 
x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b4316b //eor z11.d,z11.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b5318c //eor z12.d,z12.d,z21.d +.inst 0x04b631ad //eor z13.d,z13.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 0x04b731ce //eor z14.d,z14.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + b 210f +200: +.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s +.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s +.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s +.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s + +.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s +.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s +.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s +.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d +.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d +.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d + +.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d +.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s +.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s +.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s +.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s + +.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s +.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s +.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s +.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s + +.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d +.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d + +.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d +.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d +.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x11,x11,x12 +.endif +.if mixin == 1 + eor x13,x13,x14 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 
+.endif +.if mixin == 1 + eor x21,x21,x22 +.endif +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23084 //eor z4.d,z4.d,z18.d +.inst 0x04b33108 //eor z8.d,z8.d,z19.d +.inst 0x04b4318c //eor z12.d,z12.d,z20.d +.inst 0x04b53021 //eor z1.d,z1.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b73129 //eor z9.d,z9.d,z23.d +.inst 0x04b831ad //eor z13.d,z13.d,z24.d +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13042 //eor z2.d,z2.d,z17.d +.inst 0x04b230c6 //eor z6.d,z6.d,z18.d +.if mixin == 1 + stp x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b431ce //eor z14.d,z14.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b53063 //eor z3.d,z3.d,z21.d +.inst 0x04b630e7 //eor z7.d,z7.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 0x04b7316b //eor z11.d,z11.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d +.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] +.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL] +.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] +.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] +.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] +.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] +.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] +.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] +.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] +.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL] +.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +210: +.inst 0x04b0e3fd //incw x29, ALL, MUL #1 + subs x2,x2,64 + b.gt 100b + b 110f +101: + mixin=0 + lsr x8,x23,#32 +.inst 0x05a03ae0 //dup z0.s,w23 +.inst 0x05a03af9 //dup z25.s,w23 +.if mixin == 1 + mov w7,w23 +.endif +.inst 0x05a03904 //dup z4.s,w8 +.inst 0x05a0391a //dup z26.s,w8 + lsr x10,x24,#32 +.inst 0x05a03b08 //dup z8.s,w24 +.inst 0x05a03b1b //dup z27.s,w24 +.if mixin == 1 + mov w9,w24 +.endif +.inst 0x05a0394c //dup z12.s,w10 +.inst 0x05a0395c //dup z28.s,w10 + lsr x12,x25,#32 +.inst 0x05a03b21 //dup z1.s,w25 +.inst 0x05a03b3d //dup z29.s,w25 +.if mixin == 1 + mov w11,w25 +.endif +.inst 0x05a03985 //dup z5.s,w12 +.inst 0x05a0399e //dup z30.s,w12 + lsr x14,x26,#32 +.inst 0x05a03b49 //dup z9.s,w26 +.inst 0x05a03b55 //dup z21.s,w26 +.if mixin == 1 + mov w13,w26 +.endif +.inst 0x05a039cd //dup z13.s,w14 +.inst 0x05a039d6 //dup z22.s,w14 + lsr x16,x27,#32 
+.inst 0x05a03b62 //dup z2.s,w27 +.inst 0x05a03b77 //dup z23.s,w27 +.if mixin == 1 + mov w15,w27 +.endif +.inst 0x05a03a06 //dup z6.s,w16 +.inst 0x05a03a18 //dup z24.s,w16 + lsr x18,x28,#32 +.inst 0x05a03b8a //dup z10.s,w28 +.inst 0x05a03b91 //dup z17.s,w28 +.if mixin == 1 + mov w17,w28 +.endif +.inst 0x05a03a4e //dup z14.s,w18 +.inst 0x05a03a52 //dup z18.s,w18 + lsr x22,x30,#32 +.inst 0x05a03bcb //dup z11.s,w30 +.inst 0x05a03bd4 //dup z20.s,w30 +.if mixin == 1 + mov w21,w30 +.endif +.inst 0x05a03acf //dup z15.s,w22 +.inst 0x05a03adf //dup z31.s,w22 +.if mixin == 1 + add w20,w29,#1 + mov w19,w29 +.inst 0x04a14690 //index z16.s,w20,1 +.inst 0x04a14683 //index z3.s,w20,1 +.else +.inst 0x04a147b0 //index z16.s,w29,1 +.inst 0x04a147a3 //index z3.s,w29,1 +.endif + lsr x20,x29,#32 +.inst 0x05a03a87 //dup z7.s,w20 +.inst 0x05a03a93 //dup z19.s,w20 + mov x6,#10 +10: +.align 5 +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04703403 //xar z3.s,z3.s,z0.s,16 +.if mixin == 1 + ror w19,w19,16 +.endif +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04703487 //xar z7.s,z7.s,z4.s,16 +.if mixin == 1 + ror w20,w20,16 +.endif +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x0470350b //xar z11.s,z11.s,z8.s,16 +.if mixin == 1 + ror w21,w21,16 +.endif +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 0x0470358f //xar z15.s,z15.s,z12.s,16 +.if mixin == 1 + ror w22,w22,16 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x046c3441 //xar z1.s,z1.s,z2.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x046c34c5 //xar z5.s,z5.s,z6.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04683403 //xar z3.s,z3.s,z0.s,24 +.if mixin == 1 + ror w19,w19,24 +.endif +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04683487 //xar z7.s,z7.s,z4.s,24 +.if mixin == 1 + ror w20,w20,24 +.endif +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x0468350b //xar z11.s,z11.s,z8.s,24 +.if mixin == 1 + ror w21,w21,24 +.endif +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 0x0468358f //xar z15.s,z15.s,z12.s,24 +.if mixin == 1 + ror w22,w22,24 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add 
z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x04673441 //xar z1.s,z1.s,z2.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25 +.if mixin == 1 + ror w12,w12,25 +.endif +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x04673549 //xar z9.s,z9.s,z10.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x046735cd //xar z13.s,z13.s,z14.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x0470340f //xar z15.s,z15.s,z0.s,16 +.if mixin == 1 + ror w22,w22,16 +.endif +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04703483 //xar z3.s,z3.s,z4.s,16 +.if mixin == 1 + ror w19,w19,16 +.endif +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04703507 //xar z7.s,z7.s,z8.s,16 +.if mixin == 1 + ror w20,w20,16 +.endif +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x0470358b //xar z11.s,z11.s,z12.s,16 +.if mixin == 1 + ror w21,w21,16 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 0x046c344d //xar z13.s,z13.s,z2.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x0468340f //xar z15.s,z15.s,z0.s,24 +.if mixin == 1 + ror w22,w22,24 +.endif +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04683483 //xar z3.s,z3.s,z4.s,24 +.if mixin == 1 + ror w19,w19,24 +.endif +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04683507 //xar z7.s,z7.s,z8.s,24 +.if mixin == 1 + ror w20,w20,24 +.endif +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x0468358b //xar z11.s,z11.s,z12.s,24 +.if mixin == 1 + ror w21,w21,24 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x04673545 //xar z5.s,z5.s,z10.s,25 +.if mixin 
== 1 + ror w12,w12,25 +.endif +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 0x0467344d //xar z13.s,z13.s,z2.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif + sub x6,x6,1 + cbnz x6,10b +.if mixin == 1 + add w7,w7,w23 +.endif +.inst 0x04b90000 //add z0.s,z0.s,z25.s +.if mixin == 1 + add x8,x8,x23,lsr #32 +.endif +.inst 0x04ba0084 //add z4.s,z4.s,z26.s +.if mixin == 1 + add x7,x7,x8,lsl #32 // pack +.endif +.if mixin == 1 + add w9,w9,w24 +.endif +.inst 0x04bb0108 //add z8.s,z8.s,z27.s +.if mixin == 1 + add x10,x10,x24,lsr #32 +.endif +.inst 0x04bc018c //add z12.s,z12.s,z28.s +.if mixin == 1 + add x9,x9,x10,lsl #32 // pack +.endif +.if mixin == 1 + ldp x8,x10,[x1],#16 +.endif +.if mixin == 1 + add w11,w11,w25 +.endif +.inst 0x04bd0021 //add z1.s,z1.s,z29.s +.if mixin == 1 + add x12,x12,x25,lsr #32 +.endif +.inst 0x04be00a5 //add z5.s,z5.s,z30.s +.if mixin == 1 + add x11,x11,x12,lsl #32 // pack +.endif +.if mixin == 1 + add w13,w13,w26 +.endif +.inst 0x04b50129 //add z9.s,z9.s,z21.s +.if mixin == 1 + add x14,x14,x26,lsr #32 +.endif +.inst 0x04b601ad //add z13.s,z13.s,z22.s +.if mixin == 1 + add x13,x13,x14,lsl #32 // pack +.endif +.if mixin == 1 + ldp x12,x14,[x1],#16 +.endif +.if mixin == 1 + add w15,w15,w27 +.endif +.inst 0x04b70042 //add z2.s,z2.s,z23.s +.if mixin == 1 + add x16,x16,x27,lsr #32 +.endif +.inst 0x04b800c6 //add z6.s,z6.s,z24.s +.if mixin == 1 + add x15,x15,x16,lsl #32 // pack +.endif +.if mixin == 1 + add w17,w17,w28 +.endif +.inst 0x04b1014a //add z10.s,z10.s,z17.s +.if mixin == 1 + add x18,x18,x28,lsr #32 +.endif +.inst 0x04b201ce //add z14.s,z14.s,z18.s +.if mixin == 1 + add x17,x17,x18,lsl #32 // pack +.endif +.if mixin == 1 + ldp x16,x18,[x1],#16 +.endif +.if mixin == 1 + add w19,w19,w29 +.endif +.inst 0x04b00063 //add z3.s,z3.s,z16.s +.if mixin == 1 + add x20,x20,x29,lsr #32 +.endif +.inst 0x04b300e7 //add z7.s,z7.s,z19.s +.if mixin == 1 + add x19,x19,x20,lsl #32 // pack +.endif +.if mixin == 1 + add w21,w21,w30 +.endif +.inst 0x04b4016b //add z11.s,z11.s,z20.s +.if mixin == 1 + add x22,x22,x30,lsr #32 +.endif +.inst 0x04bf01ef //add z15.s,z15.s,z31.s +.if mixin == 1 + add x21,x21,x22,lsl #32 // pack +.endif +.if mixin == 1 + ldp x20,x22,[x1],#16 +.endif +#ifdef __AARCH64EB__ + rev x7,x7 +.inst 0x05a48000 //revb z0.s,p0/m,z0.s +.inst 0x05a48084 //revb z4.s,p0/m,z4.s + rev x9,x9 +.inst 0x05a48108 //revb z8.s,p0/m,z8.s +.inst 0x05a4818c //revb z12.s,p0/m,z12.s + rev x11,x11 +.inst 0x05a48021 //revb z1.s,p0/m,z1.s +.inst 0x05a480a5 //revb z5.s,p0/m,z5.s + rev x13,x13 +.inst 0x05a48129 //revb z9.s,p0/m,z9.s +.inst 0x05a481ad //revb z13.s,p0/m,z13.s + rev x15,x15 +.inst 0x05a48042 //revb z2.s,p0/m,z2.s +.inst 0x05a480c6 //revb z6.s,p0/m,z6.s + rev x17,x17 +.inst 0x05a4814a //revb z10.s,p0/m,z10.s +.inst 0x05a481ce //revb z14.s,p0/m,z14.s + rev x19,x19 +.inst 0x05a48063 //revb z3.s,p0/m,z3.s +.inst 0x05a480e7 //revb z7.s,p0/m,z7.s + rev x21,x21 +.inst 0x05a4816b //revb z11.s,p0/m,z11.s +.inst 0x05a481ef //revb z15.s,p0/m,z15.s +#endif +.if mixin == 1 + add x29,x29,#1 +.endif + cmp x5,4 + b.ne 200f +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.if mixin == 1 + eor x11,x11,x12 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 
z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if mixin == 1 + eor x13,x13,x14 +.endif +.if mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 +.endif +.if mixin == 1 + eor x21,x21,x22 +.endif + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23021 //eor z1.d,z1.d,z18.d +.inst 0x04b33042 //eor z2.d,z2.d,z19.d +.inst 0x04b43063 //eor z3.d,z3.d,z20.d +.inst 0x04b53084 //eor z4.d,z4.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b730c6 //eor z6.d,z6.d,z23.d +.inst 0x04b830e7 //eor z7.d,z7.d,z24.d + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13108 //eor z8.d,z8.d,z17.d +.inst 0x04b23129 //eor z9.d,z9.d,z18.d +.if mixin == 1 + stp x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b4316b //eor z11.d,z11.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b5318c //eor z12.d,z12.d,z21.d +.inst 0x04b631ad //eor z13.d,z13.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 0x04b731ce //eor z14.d,z14.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + b 210f +200: +.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s +.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s +.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s +.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s + +.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s +.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s +.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s +.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d +.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d +.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d + +.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d +.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s +.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s +.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s +.inst 
0x05ab6554 //zip2 z20.s,z10.s,z11.s + +.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s +.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s +.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s +.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s + +.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d +.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d + +.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d +.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d +.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x11,x11,x12 +.endif +.if mixin == 1 + eor x13,x13,x14 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 +.endif +.if mixin == 1 + eor x21,x21,x22 +.endif +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23084 //eor z4.d,z4.d,z18.d +.inst 0x04b33108 //eor z8.d,z8.d,z19.d +.inst 0x04b4318c //eor z12.d,z12.d,z20.d +.inst 0x04b53021 //eor z1.d,z1.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b73129 //eor z9.d,z9.d,z23.d +.inst 0x04b831ad //eor z13.d,z13.d,z24.d +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13042 //eor z2.d,z2.d,z17.d +.inst 0x04b230c6 //eor z6.d,z6.d,z18.d +.if mixin == 1 + stp 
x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b431ce //eor z14.d,z14.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b53063 //eor z3.d,z3.d,z21.d +.inst 0x04b630e7 //eor z7.d,z7.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 0x04b7316b //eor z11.d,z11.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d +.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] +.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL] +.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] +.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] +.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] +.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] +.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] +.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] +.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] +.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL] +.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +210: +.inst 0x04b0e3fd //incw x29, ALL, MUL #1 +110: + b 2f +1: +.align 5 +100: + subs x7,x2,x5,lsl #6 + b.lt 110f + mov x2,x7 + b.eq 101f + cmp x2,64 + b.lt 101f + mixin=1 + lsr x8,x23,#32 +.inst 0x05a03ae0 //dup z0.s,w23 +.inst 0x05a03af9 //dup z25.s,w23 +.if mixin == 1 + mov w7,w23 +.endif +.inst 0x05a03904 //dup z4.s,w8 +.inst 0x05a0391a //dup z26.s,w8 + lsr x10,x24,#32 +.inst 0x05a03b08 //dup z8.s,w24 +.inst 0x05a03b1b //dup z27.s,w24 +.if mixin == 1 + mov w9,w24 +.endif +.inst 0x05a0394c //dup z12.s,w10 +.inst 0x05a0395c //dup z28.s,w10 + lsr x12,x25,#32 +.inst 0x05a03b21 //dup z1.s,w25 +.inst 0x05a03b3d //dup z29.s,w25 +.if mixin == 1 + mov w11,w25 +.endif +.inst 0x05a03985 //dup z5.s,w12 +.inst 0x05a0399e //dup z30.s,w12 + lsr x14,x26,#32 +.inst 0x05a03b49 //dup z9.s,w26 +.inst 0x05a03b55 //dup z21.s,w26 +.if mixin == 1 + mov w13,w26 +.endif +.inst 0x05a039cd //dup z13.s,w14 +.inst 0x05a039d6 //dup z22.s,w14 + lsr x16,x27,#32 +.inst 0x05a03b62 //dup z2.s,w27 +.inst 0x05a03b77 //dup z23.s,w27 +.if mixin == 1 + mov w15,w27 +.endif +.inst 0x05a03a06 //dup z6.s,w16 +.inst 0x05a03a18 //dup z24.s,w16 + lsr x18,x28,#32 +.inst 0x05a03b8a //dup z10.s,w28 +.if mixin == 1 + mov w17,w28 +.endif +.inst 0x05a03a4e //dup z14.s,w18 + lsr x22,x30,#32 +.inst 0x05a03bcb //dup z11.s,w30 +.if mixin == 1 + mov w21,w30 +.endif +.inst 0x05a03acf //dup z15.s,w22 +.if mixin == 1 + add w20,w29,#1 + mov w19,w29 +.inst 0x04a14690 //index z16.s,w20,1 +.inst 0x04a14683 //index z3.s,w20,1 +.else +.inst 0x04a147b0 //index z16.s,w29,1 +.inst 0x04a147a3 //index z3.s,w29,1 +.endif + lsr x20,x29,#32 +.inst 0x05a03a87 //dup z7.s,w20 + mov x6,#10 +10: +.align 5 +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.inst 0x04a03063 //eor z3.d,z3.d,z0.d +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04a430e7 //eor z7.d,z7.d,z4.d +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04a8316b //eor z11.d,z11.d,z8.d +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x04ac31ef //eor z15.d,z15.d,z12.d +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 
0x05a58063 //revh z3.s,p0/m,z3.s +.if mixin == 1 + ror w19,w19,#16 +.endif +.inst 0x05a580e7 //revh z7.s,p0/m,z7.s +.if mixin == 1 + ror w20,w20,#16 +.endif +.inst 0x05a5816b //revh z11.s,p0/m,z11.s +.if mixin == 1 + ror w21,w21,#16 +.endif +.inst 0x05a581ef //revh z15.s,p0/m,z15.s +.if mixin == 1 + ror w22,w22,#16 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.inst 0x04a23021 //eor z1.d,z1.d,z2.d +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x04a630a5 //eor z5.d,z5.d,z6.d +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x04aa3129 //eor z9.d,z9.d,z10.d +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x04ae31ad //eor z13.d,z13.d,z14.d +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x046c9c31 //lsl z17.s,z1.s,12 +.inst 0x046c9cb2 //lsl z18.s,z5.s,12 +.inst 0x046c9d33 //lsl z19.s,z9.s,12 +.inst 0x046c9db4 //lsl z20.s,z13.s,12 +.inst 0x046c9421 //lsr z1.s,z1.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.inst 0x046c94a5 //lsr z5.s,z5.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.inst 0x046c9529 //lsr z9.s,z9.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.inst 0x046c95ad //lsr z13.s,z13.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.inst 0x04713021 //orr z1.d,z1.d,z17.d +.inst 0x047230a5 //orr z5.d,z5.d,z18.d +.inst 0x04733129 //orr z9.d,z9.d,z19.d +.inst 0x047431ad //orr z13.d,z13.d,z20.d +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.inst 0x04a03063 //eor z3.d,z3.d,z0.d +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04a430e7 //eor z7.d,z7.d,z4.d +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04a8316b //eor z11.d,z11.d,z8.d +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x04ac31ef //eor z15.d,z15.d,z12.d +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b +.if mixin == 1 + ror w19,w19,#24 +.endif +.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b +.if mixin == 1 + ror w20,w20,#24 +.endif +.inst 0x053f316b //tbl z11.b,{z11.b},z31.b +.if mixin == 1 + ror w21,w21,#24 +.endif +.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b +.if mixin == 1 + ror w22,w22,#24 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.inst 0x04a23021 //eor z1.d,z1.d,z2.d +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x04a630a5 //eor z5.d,z5.d,z6.d +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x04aa3129 //eor z9.d,z9.d,z10.d +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x04ae31ad //eor z13.d,z13.d,z14.d +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x04679c31 //lsl z17.s,z1.s,7 +.inst 0x04679cb2 //lsl z18.s,z5.s,7 +.inst 0x04679d33 //lsl z19.s,z9.s,7 +.inst 0x04679db4 //lsl z20.s,z13.s,7 +.inst 0x04679421 //lsr z1.s,z1.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif +.inst 0x046794a5 //lsr z5.s,z5.s,25 +.if mixin == 1 + ror w12,w12,25 
+.endif +.inst 0x04679529 //lsr z9.s,z9.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.inst 0x046795ad //lsr z13.s,z13.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.inst 0x04713021 //orr z1.d,z1.d,z17.d +.inst 0x047230a5 //orr z5.d,z5.d,z18.d +.inst 0x04733129 //orr z9.d,z9.d,z19.d +.inst 0x047431ad //orr z13.d,z13.d,z20.d +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.inst 0x04a031ef //eor z15.d,z15.d,z0.d +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x04a43063 //eor z3.d,z3.d,z4.d +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04a830e7 //eor z7.d,z7.d,z8.d +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04ac316b //eor z11.d,z11.d,z12.d +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x05a581ef //revh z15.s,p0/m,z15.s +.if mixin == 1 + ror w22,w22,#16 +.endif +.inst 0x05a58063 //revh z3.s,p0/m,z3.s +.if mixin == 1 + ror w19,w19,#16 +.endif +.inst 0x05a580e7 //revh z7.s,p0/m,z7.s +.if mixin == 1 + ror w20,w20,#16 +.endif +.inst 0x05a5816b //revh z11.s,p0/m,z11.s +.if mixin == 1 + ror w21,w21,#16 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x04ae3129 //eor z9.d,z9.d,z14.d +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x04a231ad //eor z13.d,z13.d,z2.d +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 0x04a63021 //eor z1.d,z1.d,z6.d +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x046c9cb1 //lsl z17.s,z5.s,12 +.inst 0x046c9d32 //lsl z18.s,z9.s,12 +.inst 0x046c9db3 //lsl z19.s,z13.s,12 +.inst 0x046c9c34 //lsl z20.s,z1.s,12 +.inst 0x046c94a5 //lsr z5.s,z5.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.inst 0x046c9529 //lsr z9.s,z9.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.inst 0x046c95ad //lsr z13.s,z13.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.inst 0x046c9421 //lsr z1.s,z1.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.inst 0x047130a5 //orr z5.d,z5.d,z17.d +.inst 0x04723129 //orr z9.d,z9.d,z18.d +.inst 0x047331ad //orr z13.d,z13.d,z19.d +.inst 0x04743021 //orr z1.d,z1.d,z20.d +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.inst 0x04a031ef //eor z15.d,z15.d,z0.d +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x04a43063 //eor z3.d,z3.d,z4.d +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04a830e7 //eor z7.d,z7.d,z8.d +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04ac316b //eor z11.d,z11.d,z12.d +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b +.if mixin == 1 + ror w22,w22,#24 +.endif +.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b +.if mixin == 1 + ror w19,w19,#24 +.endif +.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b +.if mixin == 1 + ror w20,w20,#24 +.endif +.inst 0x053f316b //tbl z11.b,{z11.b},z31.b +.if mixin == 1 
+ ror w21,w21,#24 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x04ae3129 //eor z9.d,z9.d,z14.d +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x04a231ad //eor z13.d,z13.d,z2.d +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 0x04a63021 //eor z1.d,z1.d,z6.d +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x04679cb1 //lsl z17.s,z5.s,7 +.inst 0x04679d32 //lsl z18.s,z9.s,7 +.inst 0x04679db3 //lsl z19.s,z13.s,7 +.inst 0x04679c34 //lsl z20.s,z1.s,7 +.inst 0x046794a5 //lsr z5.s,z5.s,25 +.if mixin == 1 + ror w12,w12,25 +.endif +.inst 0x04679529 //lsr z9.s,z9.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.inst 0x046795ad //lsr z13.s,z13.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.inst 0x04679421 //lsr z1.s,z1.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif +.inst 0x047130a5 //orr z5.d,z5.d,z17.d +.inst 0x04723129 //orr z9.d,z9.d,z18.d +.inst 0x047331ad //orr z13.d,z13.d,z19.d +.inst 0x04743021 //orr z1.d,z1.d,z20.d + sub x6,x6,1 + cbnz x6,10b + lsr x6,x28,#32 +.inst 0x05a03b91 //dup z17.s,w28 +.inst 0x05a038d2 //dup z18.s,w6 + lsr x6,x29,#32 +.inst 0x05a038d3 //dup z19.s,w6 + lsr x6,x30,#32 +.if mixin == 1 + add w7,w7,w23 +.endif +.inst 0x04b90000 //add z0.s,z0.s,z25.s +.if mixin == 1 + add x8,x8,x23,lsr #32 +.endif +.inst 0x04ba0084 //add z4.s,z4.s,z26.s +.if mixin == 1 + add x7,x7,x8,lsl #32 // pack +.endif +.if mixin == 1 + add w9,w9,w24 +.endif +.inst 0x04bb0108 //add z8.s,z8.s,z27.s +.if mixin == 1 + add x10,x10,x24,lsr #32 +.endif +.inst 0x04bc018c //add z12.s,z12.s,z28.s +.if mixin == 1 + add x9,x9,x10,lsl #32 // pack +.endif +.if mixin == 1 + ldp x8,x10,[x1],#16 +.endif +.if mixin == 1 + add w11,w11,w25 +.endif +.inst 0x04bd0021 //add z1.s,z1.s,z29.s +.if mixin == 1 + add x12,x12,x25,lsr #32 +.endif +.inst 0x04be00a5 //add z5.s,z5.s,z30.s +.if mixin == 1 + add x11,x11,x12,lsl #32 // pack +.endif +.if mixin == 1 + add w13,w13,w26 +.endif +.inst 0x04b50129 //add z9.s,z9.s,z21.s +.if mixin == 1 + add x14,x14,x26,lsr #32 +.endif +.inst 0x04b601ad //add z13.s,z13.s,z22.s +.if mixin == 1 + add x13,x13,x14,lsl #32 // pack +.endif +.if mixin == 1 + ldp x12,x14,[x1],#16 +.endif +.if mixin == 1 + add w15,w15,w27 +.endif +.inst 0x04b70042 //add z2.s,z2.s,z23.s +.if mixin == 1 + add x16,x16,x27,lsr #32 +.endif +.inst 0x04b800c6 //add z6.s,z6.s,z24.s +.if mixin == 1 + add x15,x15,x16,lsl #32 // pack +.endif +.if mixin == 1 + add w17,w17,w28 +.endif +.inst 0x04b1014a //add z10.s,z10.s,z17.s +.if mixin == 1 + add x18,x18,x28,lsr #32 +.endif +.inst 0x04b201ce //add z14.s,z14.s,z18.s +.if mixin == 1 + add x17,x17,x18,lsl #32 // pack +.endif +.if mixin == 1 + ldp x16,x18,[x1],#16 +.endif +.inst 0x05a03bd4 //dup z20.s,w30 +.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE +.if mixin == 1 + add w19,w19,w29 +.endif +.inst 0x04b00063 //add z3.s,z3.s,z16.s +.if mixin == 1 + add x20,x20,x29,lsr #32 +.endif +.inst 0x04b300e7 //add z7.s,z7.s,z19.s +.if mixin == 1 + add x19,x19,x20,lsl #32 // pack +.endif +.if mixin == 1 + add w21,w21,w30 +.endif +.inst 0x04b4016b //add z11.s,z11.s,z20.s +.if mixin == 1 + add x22,x22,x30,lsr #32 +.endif +.inst 0x04b901ef //add z15.s,z15.s,z25.s +.if mixin == 1 + add x21,x21,x22,lsl #32 
// pack +.endif +.if mixin == 1 + ldp x20,x22,[x1],#16 +.endif +#ifdef __AARCH64EB__ + rev x7,x7 +.inst 0x05a48000 //revb z0.s,p0/m,z0.s +.inst 0x05a48084 //revb z4.s,p0/m,z4.s + rev x9,x9 +.inst 0x05a48108 //revb z8.s,p0/m,z8.s +.inst 0x05a4818c //revb z12.s,p0/m,z12.s + rev x11,x11 +.inst 0x05a48021 //revb z1.s,p0/m,z1.s +.inst 0x05a480a5 //revb z5.s,p0/m,z5.s + rev x13,x13 +.inst 0x05a48129 //revb z9.s,p0/m,z9.s +.inst 0x05a481ad //revb z13.s,p0/m,z13.s + rev x15,x15 +.inst 0x05a48042 //revb z2.s,p0/m,z2.s +.inst 0x05a480c6 //revb z6.s,p0/m,z6.s + rev x17,x17 +.inst 0x05a4814a //revb z10.s,p0/m,z10.s +.inst 0x05a481ce //revb z14.s,p0/m,z14.s + rev x19,x19 +.inst 0x05a48063 //revb z3.s,p0/m,z3.s +.inst 0x05a480e7 //revb z7.s,p0/m,z7.s + rev x21,x21 +.inst 0x05a4816b //revb z11.s,p0/m,z11.s +.inst 0x05a481ef //revb z15.s,p0/m,z15.s +#endif +.if mixin == 1 + add x29,x29,#1 +.endif + cmp x5,4 + b.ne 200f +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.if mixin == 1 + eor x11,x11,x12 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if mixin == 1 + eor x13,x13,x14 +.endif +.if mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 +.endif +.if mixin == 1 + eor x21,x21,x22 +.endif + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23021 //eor z1.d,z1.d,z18.d +.inst 0x04b33042 //eor z2.d,z2.d,z19.d +.inst 0x04b43063 //eor z3.d,z3.d,z20.d +.inst 0x04b53084 //eor z4.d,z4.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b730c6 //eor z6.d,z6.d,z23.d +.inst 0x04b830e7 //eor z7.d,z7.d,z24.d + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13108 //eor z8.d,z8.d,z17.d +.inst 0x04b23129 //eor z9.d,z9.d,z18.d +.if mixin == 1 + stp x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b4316b //eor z11.d,z11.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b5318c //eor z12.d,z12.d,z21.d +.inst 0x04b631ad //eor z13.d,z13.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 
0x04b731ce //eor z14.d,z14.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + b 210f +200: +.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s +.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s +.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s +.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s + +.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s +.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s +.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s +.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d +.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d +.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d + +.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d +.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s +.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s +.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s +.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s + +.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s +.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s +.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s +.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s + +.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d +.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d + +.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d +.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d +.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x11,x11,x12 +.endif +.if mixin == 1 + eor x13,x13,x14 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 +.endif +.if mixin == 1 + eor x21,x21,x22 +.endif +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w 
{z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23084 //eor z4.d,z4.d,z18.d +.inst 0x04b33108 //eor z8.d,z8.d,z19.d +.inst 0x04b4318c //eor z12.d,z12.d,z20.d +.inst 0x04b53021 //eor z1.d,z1.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b73129 //eor z9.d,z9.d,z23.d +.inst 0x04b831ad //eor z13.d,z13.d,z24.d +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13042 //eor z2.d,z2.d,z17.d +.inst 0x04b230c6 //eor z6.d,z6.d,z18.d +.if mixin == 1 + stp x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b431ce //eor z14.d,z14.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b53063 //eor z3.d,z3.d,z21.d +.inst 0x04b630e7 //eor z7.d,z7.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 0x04b7316b //eor z11.d,z11.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d +.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] +.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL] +.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] +.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] +.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] +.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] +.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] +.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] +.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] +.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL] +.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +210: +.inst 0x04b0e3fd //incw x29, ALL, MUL #1 + subs x2,x2,64 + b.gt 100b + b 110f +101: + mixin=0 + lsr x8,x23,#32 +.inst 0x05a03ae0 //dup z0.s,w23 +.inst 0x05a03af9 //dup z25.s,w23 +.if mixin == 1 + mov w7,w23 +.endif +.inst 0x05a03904 //dup z4.s,w8 +.inst 0x05a0391a //dup z26.s,w8 + lsr x10,x24,#32 +.inst 0x05a03b08 //dup z8.s,w24 +.inst 0x05a03b1b //dup z27.s,w24 +.if mixin == 1 + mov w9,w24 +.endif +.inst 0x05a0394c //dup z12.s,w10 +.inst 0x05a0395c //dup z28.s,w10 + lsr x12,x25,#32 +.inst 0x05a03b21 //dup z1.s,w25 +.inst 0x05a03b3d //dup z29.s,w25 +.if mixin == 1 + mov w11,w25 +.endif +.inst 0x05a03985 //dup z5.s,w12 +.inst 0x05a0399e //dup z30.s,w12 + lsr x14,x26,#32 +.inst 0x05a03b49 //dup z9.s,w26 +.inst 0x05a03b55 //dup z21.s,w26 +.if mixin == 1 + mov w13,w26 +.endif +.inst 0x05a039cd //dup z13.s,w14 +.inst 0x05a039d6 //dup z22.s,w14 + lsr x16,x27,#32 +.inst 0x05a03b62 //dup z2.s,w27 +.inst 0x05a03b77 //dup z23.s,w27 +.if mixin == 1 + mov w15,w27 +.endif +.inst 0x05a03a06 //dup z6.s,w16 +.inst 0x05a03a18 //dup z24.s,w16 + lsr x18,x28,#32 +.inst 0x05a03b8a //dup z10.s,w28 +.if mixin == 1 + mov w17,w28 +.endif +.inst 0x05a03a4e //dup 
z14.s,w18 + lsr x22,x30,#32 +.inst 0x05a03bcb //dup z11.s,w30 +.if mixin == 1 + mov w21,w30 +.endif +.inst 0x05a03acf //dup z15.s,w22 +.if mixin == 1 + add w20,w29,#1 + mov w19,w29 +.inst 0x04a14690 //index z16.s,w20,1 +.inst 0x04a14683 //index z3.s,w20,1 +.else +.inst 0x04a147b0 //index z16.s,w29,1 +.inst 0x04a147a3 //index z3.s,w29,1 +.endif + lsr x20,x29,#32 +.inst 0x05a03a87 //dup z7.s,w20 + mov x6,#10 +10: +.align 5 +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.inst 0x04a03063 //eor z3.d,z3.d,z0.d +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04a430e7 //eor z7.d,z7.d,z4.d +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04a8316b //eor z11.d,z11.d,z8.d +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x04ac31ef //eor z15.d,z15.d,z12.d +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 0x05a58063 //revh z3.s,p0/m,z3.s +.if mixin == 1 + ror w19,w19,#16 +.endif +.inst 0x05a580e7 //revh z7.s,p0/m,z7.s +.if mixin == 1 + ror w20,w20,#16 +.endif +.inst 0x05a5816b //revh z11.s,p0/m,z11.s +.if mixin == 1 + ror w21,w21,#16 +.endif +.inst 0x05a581ef //revh z15.s,p0/m,z15.s +.if mixin == 1 + ror w22,w22,#16 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.inst 0x04a23021 //eor z1.d,z1.d,z2.d +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x04a630a5 //eor z5.d,z5.d,z6.d +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x04aa3129 //eor z9.d,z9.d,z10.d +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x04ae31ad //eor z13.d,z13.d,z14.d +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x046c9c31 //lsl z17.s,z1.s,12 +.inst 0x046c9cb2 //lsl z18.s,z5.s,12 +.inst 0x046c9d33 //lsl z19.s,z9.s,12 +.inst 0x046c9db4 //lsl z20.s,z13.s,12 +.inst 0x046c9421 //lsr z1.s,z1.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.inst 0x046c94a5 //lsr z5.s,z5.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.inst 0x046c9529 //lsr z9.s,z9.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.inst 0x046c95ad //lsr z13.s,z13.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.inst 0x04713021 //orr z1.d,z1.d,z17.d +.inst 0x047230a5 //orr z5.d,z5.d,z18.d +.inst 0x04733129 //orr z9.d,z9.d,z19.d +.inst 0x047431ad //orr z13.d,z13.d,z20.d +.inst 0x04a10000 //add z0.s,z0.s,z1.s +.if mixin == 1 + add w7,w7,w11 +.endif +.inst 0x04a50084 //add z4.s,z4.s,z5.s +.if mixin == 1 + add w8,w8,w12 +.endif +.inst 0x04a90108 //add z8.s,z8.s,z9.s +.if mixin == 1 + add w9,w9,w13 +.endif +.inst 0x04ad018c //add z12.s,z12.s,z13.s +.if mixin == 1 + add w10,w10,w14 +.endif +.inst 0x04a03063 //eor z3.d,z3.d,z0.d +.if mixin == 1 + eor w19,w19,w7 +.endif +.inst 0x04a430e7 //eor z7.d,z7.d,z4.d +.if mixin == 1 + eor w20,w20,w8 +.endif +.inst 0x04a8316b //eor z11.d,z11.d,z8.d +.if mixin == 1 + eor w21,w21,w9 +.endif +.inst 0x04ac31ef //eor z15.d,z15.d,z12.d +.if mixin == 1 + eor w22,w22,w10 +.endif +.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b +.if mixin == 1 + ror w19,w19,#24 +.endif +.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b +.if mixin == 1 + ror w20,w20,#24 +.endif +.inst 0x053f316b //tbl z11.b,{z11.b},z31.b +.if 
mixin == 1 + ror w21,w21,#24 +.endif +.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b +.if mixin == 1 + ror w22,w22,#24 +.endif +.inst 0x04a30042 //add z2.s,z2.s,z3.s +.if mixin == 1 + add w15,w15,w19 +.endif +.inst 0x04a700c6 //add z6.s,z6.s,z7.s +.if mixin == 1 + add w16,w16,w20 +.endif +.inst 0x04ab014a //add z10.s,z10.s,z11.s +.if mixin == 1 + add w17,w17,w21 +.endif +.inst 0x04af01ce //add z14.s,z14.s,z15.s +.if mixin == 1 + add w18,w18,w22 +.endif +.inst 0x04a23021 //eor z1.d,z1.d,z2.d +.if mixin == 1 + eor w11,w11,w15 +.endif +.inst 0x04a630a5 //eor z5.d,z5.d,z6.d +.if mixin == 1 + eor w12,w12,w16 +.endif +.inst 0x04aa3129 //eor z9.d,z9.d,z10.d +.if mixin == 1 + eor w13,w13,w17 +.endif +.inst 0x04ae31ad //eor z13.d,z13.d,z14.d +.if mixin == 1 + eor w14,w14,w18 +.endif +.inst 0x04679c31 //lsl z17.s,z1.s,7 +.inst 0x04679cb2 //lsl z18.s,z5.s,7 +.inst 0x04679d33 //lsl z19.s,z9.s,7 +.inst 0x04679db4 //lsl z20.s,z13.s,7 +.inst 0x04679421 //lsr z1.s,z1.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif +.inst 0x046794a5 //lsr z5.s,z5.s,25 +.if mixin == 1 + ror w12,w12,25 +.endif +.inst 0x04679529 //lsr z9.s,z9.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.inst 0x046795ad //lsr z13.s,z13.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.inst 0x04713021 //orr z1.d,z1.d,z17.d +.inst 0x047230a5 //orr z5.d,z5.d,z18.d +.inst 0x04733129 //orr z9.d,z9.d,z19.d +.inst 0x047431ad //orr z13.d,z13.d,z20.d +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.inst 0x04a031ef //eor z15.d,z15.d,z0.d +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x04a43063 //eor z3.d,z3.d,z4.d +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04a830e7 //eor z7.d,z7.d,z8.d +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04ac316b //eor z11.d,z11.d,z12.d +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x05a581ef //revh z15.s,p0/m,z15.s +.if mixin == 1 + ror w22,w22,#16 +.endif +.inst 0x05a58063 //revh z3.s,p0/m,z3.s +.if mixin == 1 + ror w19,w19,#16 +.endif +.inst 0x05a580e7 //revh z7.s,p0/m,z7.s +.if mixin == 1 + ror w20,w20,#16 +.endif +.inst 0x05a5816b //revh z11.s,p0/m,z11.s +.if mixin == 1 + ror w21,w21,#16 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x04ae3129 //eor z9.d,z9.d,z14.d +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x04a231ad //eor z13.d,z13.d,z2.d +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 0x04a63021 //eor z1.d,z1.d,z6.d +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x046c9cb1 //lsl z17.s,z5.s,12 +.inst 0x046c9d32 //lsl z18.s,z9.s,12 +.inst 0x046c9db3 //lsl z19.s,z13.s,12 +.inst 0x046c9c34 //lsl z20.s,z1.s,12 +.inst 0x046c94a5 //lsr z5.s,z5.s,20 +.if mixin == 1 + ror w12,w12,20 +.endif +.inst 0x046c9529 //lsr z9.s,z9.s,20 +.if mixin == 1 + ror w13,w13,20 +.endif +.inst 0x046c95ad //lsr z13.s,z13.s,20 +.if mixin == 1 + ror w14,w14,20 +.endif +.inst 0x046c9421 //lsr z1.s,z1.s,20 +.if mixin == 1 + ror w11,w11,20 +.endif +.inst 0x047130a5 //orr z5.d,z5.d,z17.d 
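The round loop above is the standard ChaCha20 quarter-round: add, xor, then rotate left by 16, 12, 8 and 7 bits. SVE has no 32-bit rotate, so the vector path builds the 12- and 7-bit rotations from an lsl/lsr/orr triple and uses revh and a tbl byte shuffle for the 16- and 8-bit cases, while the ".if mixin == 1" scalar instructions interleave one extra block in w-registers using plain ror. A minimal C sketch of the same quarter-round, for reference only (rotl32 and chacha_qr are illustrative names, not part of the generated file):

    #include <stdint.h>

    /*
     * One ChaCha20 quarter-round, as computed by the loop above.  The SVE
     * path expresses rotl32 as lsl/lsr/orr (12, 7) or revh/tbl (16, 8);
     * the scalar "mixin" path uses ror directly.
     */
    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));   /* n is 16, 12, 8 or 7 here */
    }

    static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16);
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }

A ChaCha double round applies this first to the four columns and then to the four diagonals of the 4x4 state, which is why the loop body repeats the add/xor/rotate pattern twice with permuted operands; x6 counts the ten double rounds.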
+.inst 0x04723129 //orr z9.d,z9.d,z18.d +.inst 0x047331ad //orr z13.d,z13.d,z19.d +.inst 0x04743021 //orr z1.d,z1.d,z20.d +.inst 0x04a50000 //add z0.s,z0.s,z5.s +.if mixin == 1 + add w7,w7,w12 +.endif +.inst 0x04a90084 //add z4.s,z4.s,z9.s +.if mixin == 1 + add w8,w8,w13 +.endif +.inst 0x04ad0108 //add z8.s,z8.s,z13.s +.if mixin == 1 + add w9,w9,w14 +.endif +.inst 0x04a1018c //add z12.s,z12.s,z1.s +.if mixin == 1 + add w10,w10,w11 +.endif +.inst 0x04a031ef //eor z15.d,z15.d,z0.d +.if mixin == 1 + eor w22,w22,w7 +.endif +.inst 0x04a43063 //eor z3.d,z3.d,z4.d +.if mixin == 1 + eor w19,w19,w8 +.endif +.inst 0x04a830e7 //eor z7.d,z7.d,z8.d +.if mixin == 1 + eor w20,w20,w9 +.endif +.inst 0x04ac316b //eor z11.d,z11.d,z12.d +.if mixin == 1 + eor w21,w21,w10 +.endif +.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b +.if mixin == 1 + ror w22,w22,#24 +.endif +.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b +.if mixin == 1 + ror w19,w19,#24 +.endif +.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b +.if mixin == 1 + ror w20,w20,#24 +.endif +.inst 0x053f316b //tbl z11.b,{z11.b},z31.b +.if mixin == 1 + ror w21,w21,#24 +.endif +.inst 0x04af014a //add z10.s,z10.s,z15.s +.if mixin == 1 + add w17,w17,w22 +.endif +.inst 0x04a301ce //add z14.s,z14.s,z3.s +.if mixin == 1 + add w18,w18,w19 +.endif +.inst 0x04a70042 //add z2.s,z2.s,z7.s +.if mixin == 1 + add w15,w15,w20 +.endif +.inst 0x04ab00c6 //add z6.s,z6.s,z11.s +.if mixin == 1 + add w16,w16,w21 +.endif +.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d +.if mixin == 1 + eor w12,w12,w17 +.endif +.inst 0x04ae3129 //eor z9.d,z9.d,z14.d +.if mixin == 1 + eor w13,w13,w18 +.endif +.inst 0x04a231ad //eor z13.d,z13.d,z2.d +.if mixin == 1 + eor w14,w14,w15 +.endif +.inst 0x04a63021 //eor z1.d,z1.d,z6.d +.if mixin == 1 + eor w11,w11,w16 +.endif +.inst 0x04679cb1 //lsl z17.s,z5.s,7 +.inst 0x04679d32 //lsl z18.s,z9.s,7 +.inst 0x04679db3 //lsl z19.s,z13.s,7 +.inst 0x04679c34 //lsl z20.s,z1.s,7 +.inst 0x046794a5 //lsr z5.s,z5.s,25 +.if mixin == 1 + ror w12,w12,25 +.endif +.inst 0x04679529 //lsr z9.s,z9.s,25 +.if mixin == 1 + ror w13,w13,25 +.endif +.inst 0x046795ad //lsr z13.s,z13.s,25 +.if mixin == 1 + ror w14,w14,25 +.endif +.inst 0x04679421 //lsr z1.s,z1.s,25 +.if mixin == 1 + ror w11,w11,25 +.endif +.inst 0x047130a5 //orr z5.d,z5.d,z17.d +.inst 0x04723129 //orr z9.d,z9.d,z18.d +.inst 0x047331ad //orr z13.d,z13.d,z19.d +.inst 0x04743021 //orr z1.d,z1.d,z20.d + sub x6,x6,1 + cbnz x6,10b + lsr x6,x28,#32 +.inst 0x05a03b91 //dup z17.s,w28 +.inst 0x05a038d2 //dup z18.s,w6 + lsr x6,x29,#32 +.inst 0x05a038d3 //dup z19.s,w6 + lsr x6,x30,#32 +.if mixin == 1 + add w7,w7,w23 +.endif +.inst 0x04b90000 //add z0.s,z0.s,z25.s +.if mixin == 1 + add x8,x8,x23,lsr #32 +.endif +.inst 0x04ba0084 //add z4.s,z4.s,z26.s +.if mixin == 1 + add x7,x7,x8,lsl #32 // pack +.endif +.if mixin == 1 + add w9,w9,w24 +.endif +.inst 0x04bb0108 //add z8.s,z8.s,z27.s +.if mixin == 1 + add x10,x10,x24,lsr #32 +.endif +.inst 0x04bc018c //add z12.s,z12.s,z28.s +.if mixin == 1 + add x9,x9,x10,lsl #32 // pack +.endif +.if mixin == 1 + ldp x8,x10,[x1],#16 +.endif +.if mixin == 1 + add w11,w11,w25 +.endif +.inst 0x04bd0021 //add z1.s,z1.s,z29.s +.if mixin == 1 + add x12,x12,x25,lsr #32 +.endif +.inst 0x04be00a5 //add z5.s,z5.s,z30.s +.if mixin == 1 + add x11,x11,x12,lsl #32 // pack +.endif +.if mixin == 1 + add w13,w13,w26 +.endif +.inst 0x04b50129 //add z9.s,z9.s,z21.s +.if mixin == 1 + add x14,x14,x26,lsr #32 +.endif +.inst 0x04b601ad //add z13.s,z13.s,z22.s +.if mixin == 1 + add x13,x13,x14,lsl #32 // pack +.endif +.if mixin == 1 + ldp 
x12,x14,[x1],#16 +.endif +.if mixin == 1 + add w15,w15,w27 +.endif +.inst 0x04b70042 //add z2.s,z2.s,z23.s +.if mixin == 1 + add x16,x16,x27,lsr #32 +.endif +.inst 0x04b800c6 //add z6.s,z6.s,z24.s +.if mixin == 1 + add x15,x15,x16,lsl #32 // pack +.endif +.if mixin == 1 + add w17,w17,w28 +.endif +.inst 0x04b1014a //add z10.s,z10.s,z17.s +.if mixin == 1 + add x18,x18,x28,lsr #32 +.endif +.inst 0x04b201ce //add z14.s,z14.s,z18.s +.if mixin == 1 + add x17,x17,x18,lsl #32 // pack +.endif +.if mixin == 1 + ldp x16,x18,[x1],#16 +.endif +.inst 0x05a03bd4 //dup z20.s,w30 +.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE +.if mixin == 1 + add w19,w19,w29 +.endif +.inst 0x04b00063 //add z3.s,z3.s,z16.s +.if mixin == 1 + add x20,x20,x29,lsr #32 +.endif +.inst 0x04b300e7 //add z7.s,z7.s,z19.s +.if mixin == 1 + add x19,x19,x20,lsl #32 // pack +.endif +.if mixin == 1 + add w21,w21,w30 +.endif +.inst 0x04b4016b //add z11.s,z11.s,z20.s +.if mixin == 1 + add x22,x22,x30,lsr #32 +.endif +.inst 0x04b901ef //add z15.s,z15.s,z25.s +.if mixin == 1 + add x21,x21,x22,lsl #32 // pack +.endif +.if mixin == 1 + ldp x20,x22,[x1],#16 +.endif +#ifdef __AARCH64EB__ + rev x7,x7 +.inst 0x05a48000 //revb z0.s,p0/m,z0.s +.inst 0x05a48084 //revb z4.s,p0/m,z4.s + rev x9,x9 +.inst 0x05a48108 //revb z8.s,p0/m,z8.s +.inst 0x05a4818c //revb z12.s,p0/m,z12.s + rev x11,x11 +.inst 0x05a48021 //revb z1.s,p0/m,z1.s +.inst 0x05a480a5 //revb z5.s,p0/m,z5.s + rev x13,x13 +.inst 0x05a48129 //revb z9.s,p0/m,z9.s +.inst 0x05a481ad //revb z13.s,p0/m,z13.s + rev x15,x15 +.inst 0x05a48042 //revb z2.s,p0/m,z2.s +.inst 0x05a480c6 //revb z6.s,p0/m,z6.s + rev x17,x17 +.inst 0x05a4814a //revb z10.s,p0/m,z10.s +.inst 0x05a481ce //revb z14.s,p0/m,z14.s + rev x19,x19 +.inst 0x05a48063 //revb z3.s,p0/m,z3.s +.inst 0x05a480e7 //revb z7.s,p0/m,z7.s + rev x21,x21 +.inst 0x05a4816b //revb z11.s,p0/m,z11.s +.inst 0x05a481ef //revb z15.s,p0/m,z15.s +#endif +.if mixin == 1 + add x29,x29,#1 +.endif + cmp x5,4 + b.ne 200f +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.if mixin == 1 + eor x11,x11,x12 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if mixin == 1 + eor x13,x13,x14 +.endif +.if mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 
z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 +.endif +.if mixin == 1 + eor x21,x21,x22 +.endif + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23021 //eor z1.d,z1.d,z18.d +.inst 0x04b33042 //eor z2.d,z2.d,z19.d +.inst 0x04b43063 //eor z3.d,z3.d,z20.d +.inst 0x04b53084 //eor z4.d,z4.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b730c6 //eor z6.d,z6.d,z23.d +.inst 0x04b830e7 //eor z7.d,z7.d,z24.d + ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 + ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13108 //eor z8.d,z8.d,z17.d +.inst 0x04b23129 //eor z9.d,z9.d,z18.d +.if mixin == 1 + stp x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b4316b //eor z11.d,z11.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b5318c //eor z12.d,z12.d,z21.d +.inst 0x04b631ad //eor z13.d,z13.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 0x04b731ce //eor z14.d,z14.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + b 210f +200: +.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s +.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s +.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s +.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s + +.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s +.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s +.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s +.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d +.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d +.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d + +.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d +.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d +.if mixin == 1 + eor x7,x7,x8 +.endif +.if mixin == 1 + eor x9,x9,x10 +.endif +.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s +.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s +.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s +.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s + +.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s +.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s +.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s +.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s + +.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d +.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d + +.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d +.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d +.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x11,x11,x12 +.endif +.if mixin == 1 + eor x13,x13,x14 +.endif +.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s +.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s +.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s +.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s + +.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s +.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s +.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s +.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s + +.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d +.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d +.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d +.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d + +.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d +.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d +.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d +.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d +.if 
mixin == 1 + eor x15,x15,x16 +.endif +.if mixin == 1 + eor x17,x17,x18 +.endif +.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s +.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s +.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s +.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s + +.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s +.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s +.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s +.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s + +.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d +.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d +.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d +.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d + +.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d +.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d +.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d +.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d +.if mixin == 1 + eor x19,x19,x20 +.endif +.if mixin == 1 + eor x21,x21,x22 +.endif +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.inst 0x04b13000 //eor z0.d,z0.d,z17.d +.inst 0x04b23084 //eor z4.d,z4.d,z18.d +.inst 0x04b33108 //eor z8.d,z8.d,z19.d +.inst 0x04b4318c //eor z12.d,z12.d,z20.d +.inst 0x04b53021 //eor z1.d,z1.d,z21.d +.inst 0x04b630a5 //eor z5.d,z5.d,z22.d +.inst 0x04b73129 //eor z9.d,z9.d,z23.d +.inst 0x04b831ad //eor z13.d,z13.d,z24.d +.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] +.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] +.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] +.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] +.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] +.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] +.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] +.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] +.inst 0x04215101 //addvl x1,x1,8 +.if mixin == 1 + stp x7,x9,[x0],#16 +.endif +.inst 0x04b13042 //eor z2.d,z2.d,z17.d +.inst 0x04b230c6 //eor z6.d,z6.d,z18.d +.if mixin == 1 + stp x11,x13,[x0],#16 +.endif +.inst 0x04b3314a //eor z10.d,z10.d,z19.d +.inst 0x04b431ce //eor z14.d,z14.d,z20.d +.if mixin == 1 + stp x15,x17,[x0],#16 +.endif +.inst 0x04b53063 //eor z3.d,z3.d,z21.d +.inst 0x04b630e7 //eor z7.d,z7.d,z22.d +.if mixin == 1 + stp x19,x21,[x0],#16 +.endif +.inst 0x04b7316b //eor z11.d,z11.d,z23.d +.inst 0x04b831ef //eor z15.d,z15.d,z24.d +.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] +.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL] +.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] +.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] +.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] +.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] +.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] +.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] +.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] +.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] +.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL] +.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] +.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] +.inst 0x04205100 //addvl x0,x0,8 +210: +.inst 0x04b0e3fd //incw x29, ALL, MUL #1 +110: +2: + str w29,[x4] + ldp 
d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp x16,x17,[sp,64] + ldp x18,x19,[sp,80] + ldp x20,x21,[sp,96] + ldp x22,x23,[sp,112] + ldp x24,x25,[sp,128] + ldp x26,x27,[sp,144] + ldp x28,x29,[sp,160] + ldr x30,[sp,176] + ldp d8,d9,[sp],192 + AARCH64_VALIDATE_LINK_REGISTER +.Lreturn: + ret +.size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve diff --git a/sys/crypto/openssl/aarch64/chacha-armv8.S b/sys/crypto/openssl/aarch64/chacha-armv8.S index 4f9d6bd372f7..ee32415ad4c3 100644 --- a/sys/crypto/openssl/aarch64/chacha-armv8.S +++ b/sys/crypto/openssl/aarch64/chacha-armv8.S @@ -3,9 +3,11 @@ #ifndef __KERNEL__ .hidden OPENSSL_armcap_P + + #endif -.text +.section .rodata .align 5 .Lsigma: @@ -17,18 +19,19 @@ .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,%function +.text + +.globl ChaCha20_ctr32_dflt +.type ChaCha20_ctr32_dflt,%function .align 5 -ChaCha20_ctr32: +ChaCha20_ctr32_dflt: AARCH64_SIGN_LINK_REGISTER - cbz x2,.Labort cmp x2,#192 b.lo .Lshort - #ifndef __KERNEL__ adrp x17,OPENSSL_armcap_P ldr w17,[x17,#:lo12:OPENSSL_armcap_P] +.Lcheck_neon: tst w17,#ARMV7_NEON b.ne .LChaCha20_neon #endif @@ -37,7 +40,8 @@ ChaCha20_ctr32: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,#:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -304,6 +308,41 @@ ChaCha20_ctr32: ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret +.size ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: + AARCH64_SIGN_LINK_REGISTER + cbz x2,.Labort + cmp x2,#192 + b.lo .Lshort +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] + tst w17,#ARMV8_SVE + b.eq .Lcheck_neon + stp x29,x30,[sp,#-16]! + sub sp,sp,#16 + // SVE handling will inevitably increment the counter + // Neon/Scalar code that follows to process tail data needs to + // use new counter, unfortunately the input counter buffer + // pointed to by ctr is meant to be read-only per API contract + // we have to copy the buffer to stack to be writable by SVE + ldp x5,x6,[x4] + stp x5,x6,[sp] + mov x4,sp + bl ChaCha20_ctr32_sve + cbz x2,1f + bl ChaCha20_ctr32_dflt +1: + add sp,sp,#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +#endif + b .Lshort .size ChaCha20_ctr32,.-ChaCha20_ctr32 #ifdef __KERNEL__ @@ -317,7 +356,8 @@ ChaCha20_neon: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,#:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -890,7 +930,8 @@ ChaCha20_512_neon: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,#:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] diff --git a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S index 73c367bcf1fc..688187ddcf43 100644 --- a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S +++ b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from ecp_nistz256-armv8.pl. 
*/ #include "arm_arch.h" -.text +.section .rodata .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,%object .align 12 @@ -2391,6 +2391,8 @@ ecp_nistz256_precomputed: .byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 +.text + // void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_to_mont .type ecp_nistz256_to_mont,%function @@ -2401,12 +2403,16 @@ ecp_nistz256_to_mont: add x29,sp,#0 stp x19,x20,[sp,#16] - ldr x3,.LRR // bp[0] + adrp x3,.LRR + ldr x3,[x3,#:lo12:.LRR] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - adr x2,.LRR // &bp[0] + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + adrp x2,.LRR + add x2,x2,#:lo12:.LRR bl __ecp_nistz256_mul_mont @@ -2429,9 +2435,12 @@ ecp_nistz256_from_mont: mov x3,#1 // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - adr x2,.Lone // &bp[0] + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + adrp x2,.Lone + add x2,x2,#:lo12:.Lone bl __ecp_nistz256_mul_mont @@ -2455,8 +2464,10 @@ ecp_nistz256_mul_mont: ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] bl __ecp_nistz256_mul_mont @@ -2478,8 +2489,10 @@ ecp_nistz256_sqr_mont: ldp x4,x5,[x1] ldp x6,x7,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] bl __ecp_nistz256_sqr_mont @@ -2503,8 +2516,10 @@ ecp_nistz256_add: ldp x8,x9,[x2] ldp x16,x17,[x1,#16] ldp x10,x11,[x2,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] bl __ecp_nistz256_add @@ -2524,8 +2539,10 @@ ecp_nistz256_div_by_2: ldp x14,x15,[x1] ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] bl __ecp_nistz256_div_by_2 @@ -2545,8 +2562,10 @@ ecp_nistz256_mul_by_2: ldp x14,x15,[x1] ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] mov x8,x14 mov x9,x15 mov x10,x16 @@ -2570,8 +2589,10 @@ ecp_nistz256_mul_by_3: ldp x14,x15,[x1] ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] mov x8,x14 mov x9,x15 mov x10,x16 @@ -2607,8 +2628,10 @@ ecp_nistz256_sub: ldp x14,x15,[x1] ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] bl __ecp_nistz256_sub_from @@ -2631,8 +2654,10 @@ ecp_nistz256_neg: mov x15,xzr mov x16,xzr mov x17,xzr - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] bl __ecp_nistz256_sub_from @@ -3027,9 +3052,11 @@ ecp_nistz256_point_double: mov x21,x0 ldp x16,x17,[x1,#48] mov x22,x1 - ldr x12,.Lpoly+8 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] mov x8,x14 - ldr x13,.Lpoly+24 + ldr x13,[x13,#24] mov x9,x15 ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont mov x10,x16 @@ -3172,8 +3199,10 @@ ecp_nistz256_point_add: mov x21,x0 mov x22,x1 mov x23,x2 - ldr x12,.Lpoly+8 
- ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] orr x8,x4,x5 orr x10,x6,x7 orr x25,x8,x10 @@ -3423,8 +3452,10 @@ ecp_nistz256_point_add_affine: mov x21,x0 mov x22,x1 mov x23,x2 - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 + adrp x13,.Lpoly + add x13,x13,#:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] ldp x4,x5,[x1,#64] // in1_z ldp x6,x7,[x1,#64+16] @@ -3570,7 +3601,8 @@ ecp_nistz256_point_add_affine: ldp x10,x11,[x23,#0+48] stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] - adr x23,.Lone_mont-64 + adrp x23,.Lone_mont-64 + add x23,x23,#:lo12:.Lone_mont-64 ldp x14,x15,[x22,#32] // in1 cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#32+16] @@ -3628,7 +3660,8 @@ ecp_nistz256_ord_mul_mont: stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] - adr x23,.Lord + adrp x23,.Lord + add x23,x23,#:lo12:.Lord ldr x3,[x2] // bp[0] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] @@ -3838,7 +3871,8 @@ ecp_nistz256_ord_sqr_mont: stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] - adr x23,.Lord + adrp x23,.Lord + add x23,x23,#:lo12:.Lord ldp x4,x5,[x1] ldp x6,x7,[x1,#16] diff --git a/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S new file mode 100644 index 000000000000..c9d925a7bc77 --- /dev/null +++ b/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S @@ -0,0 +1,837 @@ +/* Do not modify. This file is auto-generated from ecp_sm2p256-armv8.pl. */ +#include "arm_arch.h" +.arch armv8-a +.section .rodata + +.align 5 +// The polynomial p +.Lpoly: +.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff +// The order of polynomial n +.Lord: +.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff +// (p + 1) / 2 +.Lpoly_div_2: +.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff +// (n + 1) / 2 +.Lord_div_2: +.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff + +.text + +// void bn_rshift1(BN_ULONG *a); +.globl bn_rshift1 +.type bn_rshift1,%function +.align 5 +bn_rshift1: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x0] + ldp x9,x10,[x0,#16] + + // Right shift + extr x7,x8,x7,#1 + extr x8,x9,x8,#1 + extr x9,x10,x9,#1 + lsr x10,x10,#1 + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + + ret +.size bn_rshift1,.-bn_rshift1 + +// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b); +.globl bn_sub +.type bn_sub,%function +.align 5 +bn_sub: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Subtraction + subs x7,x7,x11 + sbcs x8,x8,x12 + sbcs x9,x9,x13 + sbc x10,x10,x14 + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + + ret +.size bn_sub,.-bn_sub + +// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a); +.globl ecp_sm2p256_div_by_2 +.type ecp_sm2p256_div_by_2,%function +.align 5 +ecp_sm2p256_div_by_2: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + + // Save the least significant bit + mov x3,x7 + + // Right shift 1 + extr x7,x8,x7,#1 + extr x8,x9,x8,#1 + extr x9,x10,x9,#1 + lsr x10,x10,#1 + + // Load mod + adrp x2,.Lpoly_div_2 + add x2,x2,#:lo12:.Lpoly_div_2 + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Parity check + tst x3,#1 + csel x11,xzr,x11,eq + csel x12,xzr,x12,eq + csel x13,xzr,x13,eq + csel x14,xzr,x14,eq + + // Add + adds x7,x7,x11 + adcs x8,x8,x12 + adcs x9,x9,x13 + adc x10,x10,x14 + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + ret +.size 
ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2 + +// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a); +.globl ecp_sm2p256_div_by_2_mod_ord +.type ecp_sm2p256_div_by_2_mod_ord,%function +.align 5 +ecp_sm2p256_div_by_2_mod_ord: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + + // Save the least significant bit + mov x3,x7 + + // Right shift 1 + extr x7,x8,x7,#1 + extr x8,x9,x8,#1 + extr x9,x10,x9,#1 + lsr x10,x10,#1 + + // Load mod + adrp x2,.Lord_div_2 + add x2,x2,#:lo12:.Lord_div_2 + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Parity check + tst x3,#1 + csel x11,xzr,x11,eq + csel x12,xzr,x12,eq + csel x13,xzr,x13,eq + csel x14,xzr,x14,eq + + // Add + adds x7,x7,x11 + adcs x8,x8,x12 + adcs x9,x9,x13 + adc x10,x10,x14 + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + ret +.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord + +// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a); +.globl ecp_sm2p256_mul_by_3 +.type ecp_sm2p256_mul_by_3,%function +.align 5 +ecp_sm2p256_mul_by_3: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + + // 2*a + adds x7,x7,x7 + adcs x8,x8,x8 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x15,xzr,xzr + + mov x3,x7 + mov x4,x8 + mov x5,x9 + mov x6,x10 + + // Sub polynomial + adrp x2,.Lpoly + add x2,x2,#:lo12:.Lpoly + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + subs x7,x7,x11 + sbcs x8,x8,x12 + sbcs x9,x9,x13 + sbcs x10,x10,x14 + sbcs x15,x15,xzr + + csel x7,x7,x3,cs + csel x8,x8,x4,cs + csel x9,x9,x5,cs + csel x10,x10,x6,cs + eor x15,x15,x15 + + // 3*a + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + adds x7,x7,x11 + adcs x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x15,xzr,xzr + + mov x3,x7 + mov x4,x8 + mov x5,x9 + mov x6,x10 + + // Sub polynomial + adrp x2,.Lpoly + add x2,x2,#:lo12:.Lpoly + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + subs x7,x7,x11 + sbcs x8,x8,x12 + sbcs x9,x9,x13 + sbcs x10,x10,x14 + sbcs x15,x15,xzr + + csel x7,x7,x3,cs + csel x8,x8,x4,cs + csel x9,x9,x5,cs + csel x10,x10,x6,cs + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + + ret +.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3 + +// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b); +.globl ecp_sm2p256_add +.type ecp_sm2p256_add,%function +.align 5 +ecp_sm2p256_add: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Addition + adds x7,x7,x11 + adcs x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adc x15,xzr,xzr + + // Load polynomial + adrp x2,.Lpoly + add x2,x2,#:lo12:.Lpoly + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Backup Addition + mov x3,x7 + mov x4,x8 + mov x5,x9 + mov x6,x10 + + // Sub polynomial + subs x3,x3,x11 + sbcs x4,x4,x12 + sbcs x5,x5,x13 + sbcs x6,x6,x14 + sbcs x15,x15,xzr + + // Select based on carry + csel x7,x7,x3,cc + csel x8,x8,x4,cc + csel x9,x9,x5,cc + csel x10,x10,x6,cc + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + ret +.size ecp_sm2p256_add,.-ecp_sm2p256_add + +// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b); +.globl ecp_sm2p256_sub +.type ecp_sm2p256_sub,%function +.align 5 +ecp_sm2p256_sub: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Subtraction + subs x7,x7,x11 + sbcs x8,x8,x12 + sbcs x9,x9,x13 + sbcs x10,x10,x14 + sbc x15,xzr,xzr + + // Load polynomial + adrp x2,.Lpoly + add x2,x2,#:lo12:.Lpoly + ldp 
x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Backup subtraction + mov x3,x7 + mov x4,x8 + mov x5,x9 + mov x6,x10 + + // Add polynomial + adds x3,x3,x11 + adcs x4,x4,x12 + adcs x5,x5,x13 + adcs x6,x6,x14 + tst x15,x15 + + // Select based on carry + csel x7,x7,x3,eq + csel x8,x8,x4,eq + csel x9,x9,x5,eq + csel x10,x10,x6,eq + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + ret +.size ecp_sm2p256_sub,.-ecp_sm2p256_sub + +// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b); +.globl ecp_sm2p256_sub_mod_ord +.type ecp_sm2p256_sub_mod_ord,%function +.align 5 +ecp_sm2p256_sub_mod_ord: + AARCH64_VALID_CALL_TARGET + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Subtraction + subs x7,x7,x11 + sbcs x8,x8,x12 + sbcs x9,x9,x13 + sbcs x10,x10,x14 + sbc x15,xzr,xzr + + // Load polynomial + adrp x2,.Lord + add x2,x2,#:lo12:.Lord + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + + // Backup subtraction + mov x3,x7 + mov x4,x8 + mov x5,x9 + mov x6,x10 + + // Add polynomial + adds x3,x3,x11 + adcs x4,x4,x12 + adcs x5,x5,x13 + adcs x6,x6,x14 + tst x15,x15 + + // Select based on carry + csel x7,x7,x3,eq + csel x8,x8,x4,eq + csel x9,x9,x5,eq + csel x10,x10,x6,eq + + // Store results + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + ret +.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord + +.macro RDC + // a = | s7 | ... | s0 |, where si are 64-bit quantities + // = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities + // | s7 | s6 | s5 | s4 | + // | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 | + // | s3 | s2 | s1 | s0 | + // | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 | + // ================================================= + // | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+) + // | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+) + // | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+) + // | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+) + // | a12 | 0 | s7 | a13 | 0 | s6 | (+) + // | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+) + // | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+) + // | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+) + // | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+) + // | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+) + // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+) + // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+) + // | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+) + // | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-) + // | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-) + // | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-) + // | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-) + // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]| + // | V[3] | V[2] | V[1] | V[0] | + + // 1. 64-bit addition + // t2=s6+s7+s7 + adds x5,x13,x14 + adcs x4,xzr,xzr + adds x5,x5,x14 + adcs x4,x4,xzr + // t3=s4+s5+t2 + adds x6,x11,x5 + adcs x15,x4,xzr + adds x6,x6,x12 + adcs x15,x15,xzr + // sum + adds x7,x7,x6 + adcs x8,x8,x15 + adcs x9,x9,x5 + adcs x10,x10,x14 + adcs x3,xzr,xzr + adds x10,x10,x4 + adcs x3,x3,xzr + + stp x7,x8,[sp,#32] + stp x9,x10,[sp,#48] + + // 2. 64-bit to 32-bit spread + mov x4,#0xffffffff + mov x7,x11 + mov x8,x12 + mov x9,x13 + mov x10,x14 + and x7,x7,x4 // a8 + and x8,x8,x4 // a10 + and x9,x9,x4 // a12 + and x10,x10,x4 // a14 + lsr x11,x11,#32 // a9 + lsr x12,x12,#32 // a11 + lsr x13,x13,#32 // a13 + lsr x14,x14,#32 // a15 + + // 3. 
32-bit addition + add x4,x10,x9 // t1 <- a12 + a14 + add x5,x14,x13 // t2 <- a13 + a15 + add x6,x7,x11 // t3 <- a8 + a9 + add x15,x10,x8 // t4 <- a10 + a14 + add x14,x14,x12 // a15 <- a11 + a15 + add x9,x5,x4 // a12 <- a12 + a13 + a14 + a15 + add x8,x8,x9 // a10 <- a10 + a12 + a13 + a14 + a15 + add x8,x8,x9 // a10 <- a10 + 2*(a12 + a13 + a14 + a15) + add x8,x8,x6 // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15) + add x8,x8,x12 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15) + add x9,x9,x13 // a12 <- a12 + 2*a13 + a14 + a15 + add x9,x9,x12 // a12 <- a11 + a12 + 2*a13 + a14 + a15 + add x9,x9,x7 // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15 + add x6,x6,x10 // t3 <- a8 + a9 + a14 + add x6,x6,x13 // t3 <- a8 + a9 + a13 + a14 + add x11,x11,x5 // a9 <- a9 + a13 + a15 + add x12,x12,x11 // a11 <- a9 + a11 + a13 + a15 + add x12,x12,x5 // a11 <- a9 + a11 + 2*(a13 + a15) + add x4,x4,x15 // t1 <- a10 + a12 + 2*a14 + + // U[0] s5 a9 + a11 + 2*(a13 + a15) + // U[1] t1 a10 + a12 + 2*a14 + // U[2] -t3 a8 + a9 + a13 + a14 + // U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15 + // U[4] s4 a9 + a13 + a15 + // U[5] t4 a10 + a14 + // U[6] s7 a11 + a15 + // U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15) + + // 4. 32-bit to 64-bit + lsl x7,x4,#32 + extr x4,x9,x4,#32 + extr x9,x15,x9,#32 + extr x15,x8,x15,#32 + lsr x8,x8,#32 + + // 5. 64-bit addition + adds x12,x12,x7 + adcs x4,x4,xzr + adcs x11,x11,x9 + adcs x14,x14,x15 + adcs x3,x3,x8 + + // V[0] s5 + // V[1] t1 + // V[2] s4 + // V[3] s7 + // carry t0 + // sub t3 + + // 5. Process s0-s3 + ldp x7,x8,[sp,#32] + ldp x9,x10,[sp,#48] + // add with V0-V3 + adds x7,x7,x12 + adcs x8,x8,x4 + adcs x9,x9,x11 + adcs x10,x10,x14 + adcs x3,x3,xzr + // sub with t3 + subs x8,x8,x6 + sbcs x9,x9,xzr + sbcs x10,x10,xzr + sbcs x3,x3,xzr + + // 6. MOD + // First Mod + lsl x4,x3,#32 + subs x5,x4,x3 + + adds x7,x7,x3 + adcs x8,x8,x5 + adcs x9,x9,xzr + adcs x10,x10,x4 + + // Last Mod + // return y - p if y > p else y + mov x11,x7 + mov x12,x8 + mov x13,x9 + mov x14,x10 + + adrp x3,.Lpoly + add x3,x3,#:lo12:.Lpoly + ldp x4,x5,[x3] + ldp x6,x15,[x3,#16] + + adcs x16,xzr,xzr + + subs x7,x7,x4 + sbcs x8,x8,x5 + sbcs x9,x9,x6 + sbcs x10,x10,x15 + sbcs x16,x16,xzr + + csel x7,x7,x11,cs + csel x8,x8,x12,cs + csel x9,x9,x13,cs + csel x10,x10,x14,cs + +.endm + +// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b); +.globl ecp_sm2p256_mul +.type ecp_sm2p256_mul,%function +.align 5 +ecp_sm2p256_mul: + AARCH64_SIGN_LINK_REGISTER + // Store scalar registers + stp x29,x30,[sp,#-80]! 
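The RDC macro above, like the add/sub routines before it, finishes with the same flag-driven pattern: do the arithmetic, tentatively subtract the modulus, then keep or discard the subtraction with csel based on the carry chain. A minimal C sketch of that conditional-subtract step follows; the limb constants are the SM2 prime (presumably what .Lpoly holds) and the helper name is illustrative, not taken from this file.

#include <stdint.h>

typedef unsigned __int128 u128;

/* SM2 prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1, little-endian 64-bit limbs
 * (assumed to be the contents of .Lpoly). */
static const uint64_t SM2_P[4] = {
    0xffffffffffffffffULL, 0xffffffff00000000ULL,
    0xffffffffffffffffULL, 0xfffffffeffffffffULL
};

/* Branch-free "r = y - p if y >= p else y", where carry (0 or 1) is the bit
 * carried out of the preceding addition, i.e. a notional fifth limb of y.
 * This models the subs/sbcs + csel ...,cs shape used above; it is a sketch,
 * not the library's C fallback. */
static void sm2_cond_sub_p(uint64_t r[4], const uint64_t y[4], uint64_t carry)
{
    uint64_t t[4];
    u128 borrow = 0;

    for (int i = 0; i < 4; i++) {                 /* t = y - p, limb by limb */
        u128 d = (u128)y[i] - SM2_P[i] - borrow;
        t[i] = (uint64_t)d;
        borrow = (d >> 64) & 1;                   /* 1 if this limb borrowed */
    }

    /* Keep t when the subtraction did not underflow overall; the incoming
     * carry cancels a borrow, exactly like the final sbcs above. */
    uint64_t take_t = (carry & 1) | (uint64_t)(borrow == 0);
    uint64_t mask   = (uint64_t)0 - take_t;       /* all-ones or all-zeros */
    for (int i = 0; i < 4; i++)
        r[i] = (t[i] & mask) | (y[i] & ~mask);    /* stands in for the csel's */
}

The mask select plays the role of the four csel instructions; either way the choice is made without a data-dependent branch.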
+ add x29,sp,#0 + stp x16,x17,[sp,#16] + stp x19,x20,[sp,#64] + + // Load inputs + ldp x7,x8,[x1] + ldp x9,x10,[x1,#16] + ldp x11,x12,[x2] + ldp x13,x14,[x2,#16] + +// ### multiplication ### + // ======================== + // s3 s2 s1 s0 + // * s7 s6 s5 s4 + // ------------------------ + // + s0 s0 s0 s0 + // * * * * + // s7 s6 s5 s4 + // s1 s1 s1 s1 + // * * * * + // s7 s6 s5 s4 + // s2 s2 s2 s2 + // * * * * + // s7 s6 s5 s4 + // s3 s3 s3 s3 + // * * * * + // s7 s6 s5 s4 + // ------------------------ + // s7 s6 s5 s4 s3 s2 s1 s0 + // ======================== + +// ### s0*s4 ### + mul x16,x7,x11 + umulh x5,x7,x11 + +// ### s1*s4 + s0*s5 ### + mul x3,x8,x11 + umulh x4,x8,x11 + adds x5,x5,x3 + adcs x6,x4,xzr + + mul x3,x7,x12 + umulh x4,x7,x12 + adds x5,x5,x3 + adcs x6,x6,x4 + adcs x15,xzr,xzr + +// ### s2*s4 + s1*s5 + s0*s6 ### + mul x3,x9,x11 + umulh x4,x9,x11 + adds x6,x6,x3 + adcs x15,x15,x4 + + mul x3,x8,x12 + umulh x4,x8,x12 + adds x6,x6,x3 + adcs x15,x15,x4 + adcs x17,xzr,xzr + + mul x3,x7,x13 + umulh x4,x7,x13 + adds x6,x6,x3 + adcs x15,x15,x4 + adcs x17,x17,xzr + +// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ### + mul x3,x10,x11 + umulh x4,x10,x11 + adds x15,x15,x3 + adcs x17,x17,x4 + adcs x19,xzr,xzr + + mul x3,x9,x12 + umulh x4,x9,x12 + adds x15,x15,x3 + adcs x17,x17,x4 + adcs x19,x19,xzr + + mul x3,x8,x13 + umulh x4,x8,x13 + adds x15,x15,x3 + adcs x17,x17,x4 + adcs x19,x19,xzr + + mul x3,x7,x14 + umulh x4,x7,x14 + adds x15,x15,x3 + adcs x17,x17,x4 + adcs x19,x19,xzr + +// ### s3*s5 + s2*s6 + s1*s7 ### + mul x3,x10,x12 + umulh x4,x10,x12 + adds x17,x17,x3 + adcs x19,x19,x4 + adcs x20,xzr,xzr + + mul x3,x9,x13 + umulh x4,x9,x13 + adds x17,x17,x3 + adcs x19,x19,x4 + adcs x20,x20,xzr + + mul x3,x8,x14 + umulh x4,x8,x14 + adds x11,x17,x3 + adcs x19,x19,x4 + adcs x20,x20,xzr + +// ### s3*s6 + s2*s7 ### + mul x3,x10,x13 + umulh x4,x10,x13 + adds x19,x19,x3 + adcs x20,x20,x4 + adcs x17,xzr,xzr + + mul x3,x9,x14 + umulh x4,x9,x14 + adds x12,x19,x3 + adcs x20,x20,x4 + adcs x17,x17,xzr + +// ### s3*s7 ### + mul x3,x10,x14 + umulh x4,x10,x14 + adds x13,x20,x3 + adcs x14,x17,x4 + + mov x7,x16 + mov x8,x5 + mov x9,x6 + mov x10,x15 + + // result of mul: s7 s6 s5 s4 s3 s2 s1 s0 + +// ### Reduction ### + RDC + + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + + // Restore scalar registers + ldp x16,x17,[sp,#16] + ldp x19,x20,[sp,#64] + ldp x29,x30,[sp],#80 + + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_sm2p256_mul,.-ecp_sm2p256_mul + +// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a); +.globl ecp_sm2p256_sqr +.type ecp_sm2p256_sqr,%function +.align 5 + +ecp_sm2p256_sqr: + AARCH64_SIGN_LINK_REGISTER + // Store scalar registers + stp x29,x30,[sp,#-80]! 
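The comment diagram in ecp_sm2p256_mul above describes a plain 4x4-limb schoolbook product; the assembly walks it column by column with mul/umulh pairs so the partial sums stay in a handful of registers. The same eight output limbs can be produced row by row, which is easier to read in C. A brief sketch under that ordering, with unsigned __int128 standing in for each mul/umulh pair; the function name is illustrative only.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Schoolbook 256x256 -> 512-bit multiply in operand-scanning order.  The
 * assembly above accumulates the same partial products column-wise; the
 * eight limbs handed to RDC are identical either way. */
static void sm2_mul_4x4(uint64_t r[8], const uint64_t a[4], const uint64_t b[4])
{
    for (int i = 0; i < 8; i++)
        r[i] = 0;

    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < 4; j++) {
            /* low half -> current limb, high half -> carry (mul + umulh) */
            u128 t = (u128)a[i] * b[j] + r[i + j] + carry;
            r[i + j] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        r[i + 4] = carry;
    }
}

The squaring routine whose body continues below follows the same plan but computes each cross product a_i*a_j once and doubles it before adding the squares on the diagonal.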
+ add x29,sp,#0 + stp x16,x17,[sp,#16] + stp x19,x20,[sp,#64] + + // Load inputs + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + +// ### square ### + // ======================== + // s7 s6 s5 s4 + // * s7 s6 s5 s4 + // ------------------------ + // + s4 s4 s4 s4 + // * * * * + // s7 s6 s5 s4 + // s5 s5 s5 s5 + // * * * * + // s7 s6 s5 s4 + // s6 s6 s6 s6 + // * * * * + // s7 s6 s5 s4 + // s7 s7 s7 s7 + // * * * * + // s7 s6 s5 s4 + // ------------------------ + // s7 s6 s5 s4 s3 s2 s1 s0 + // ======================== + +// ### s4*s5 ### + mul x8,x11,x12 + umulh x9,x11,x12 + +// ### s4*s6 ### + mul x3,x13,x11 + umulh x10,x13,x11 + adds x9,x9,x3 + adcs x10,x10,xzr + +// ### s4*s7 + s5*s6 ### + mul x3,x14,x11 + umulh x4,x14,x11 + adds x10,x10,x3 + adcs x7,x4,xzr + + mul x3,x13,x12 + umulh x4,x13,x12 + adds x10,x10,x3 + adcs x7,x7,x4 + adcs x5,xzr,xzr + +// ### s5*s7 ### + mul x3,x14,x12 + umulh x4,x14,x12 + adds x7,x7,x3 + adcs x5,x5,x4 + +// ### s6*s7 ### + mul x3,x14,x13 + umulh x4,x14,x13 + adds x5,x5,x3 + adcs x6,x4,xzr + +// ### 2*(t3,t2,s0,s3,s2,s1) ### + adds x8,x8,x8 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x7,x7,x7 + adcs x5,x5,x5 + adcs x6,x6,x6 + adcs x15,xzr,xzr + +// ### s4*s4 ### + mul x16,x11,x11 + umulh x17,x11,x11 + +// ### s5*s5 ### + mul x11,x12,x12 + umulh x12,x12,x12 + +// ### s6*s6 ### + mul x3,x13,x13 + umulh x4,x13,x13 + +// ### s7*s7 ### + mul x19,x14,x14 + umulh x20,x14,x14 + + adds x8,x8,x17 + adcs x9,x9,x11 + adcs x10,x10,x12 + adcs x7,x7,x3 + adcs x5,x5,x4 + adcs x6,x6,x19 + adcs x15,x15,x20 + + mov x11,x7 + mov x7,x16 + mov x12,x5 + mov x13,x6 + mov x14,x15 + + // result of mul: s7 s6 s5 s4 s3 s2 s1 s0 + +// ### Reduction ### + RDC + + stp x7,x8,[x0] + stp x9,x10,[x0,#16] + + // Restore scalar registers + ldp x16,x17,[sp,#16] + ldp x19,x20,[sp,#64] + ldp x29,x30,[sp],#80 + + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr diff --git a/sys/crypto/openssl/aarch64/ghashv8-armx.S b/sys/crypto/openssl/aarch64/ghashv8-armx.S index 42f053d664ef..b92c6316eae5 100644 --- a/sys/crypto/openssl/aarch64/ghashv8-armx.S +++ b/sys/crypto/openssl/aarch64/ghashv8-armx.S @@ -84,15 +84,103 @@ gcm_init_v8: pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v20.16b, v0.16b,v18.16b //H^3 - eor v22.16b,v5.16b,v4.16b //H^4 + eor v23.16b, v0.16b,v18.16b //H^3 + eor v25.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing + ext v17.16b,v25.16b,v25.16b,#8 + ext v18.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v23.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v22.16b + ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5] + + //calculate H^5 and H^6 + pmull v0.1q,v22.1d, v23.1d + pmull v5.1q,v23.1d,v23.1d + pmull2 v2.1q,v22.2d, v23.2d + pmull2 v7.1q,v23.2d,v23.2d + pmull v1.1q,v16.1d,v18.1d + pmull v6.1q,v16.1d,v16.1d - ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing - ext v17.16b,v22.16b,v22.16b,#8 - eor v16.16b,v16.16b,v20.16b - eor v17.16b,v17.16b,v22.16b - ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins 
v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v26.16b,v0.16b,v18.16b //H^5 + eor v28.16b,v5.16b,v4.16b //H^6 + + ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing + ext v17.16b,v28.16b,v28.16b,#8 + ext v18.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v26.16b + eor v17.16b,v17.16b,v28.16b + eor v18.16b,v18.16b,v22.16b + ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8] + + //calculate H^7 and H^8 + pmull v0.1q,v22.1d,v26.1d + pmull v5.1q,v22.1d,v28.1d + pmull2 v2.1q,v22.2d,v26.2d + pmull2 v7.1q,v22.2d,v28.2d + pmull v1.1q,v16.1d,v18.1d + pmull v6.1q,v17.1d,v18.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v29.16b,v0.16b,v18.16b //H^7 + eor v31.16b,v5.16b,v4.16b //H^8 + + ext v16.16b,v29.16b,v29.16b,#8 //Karatsuba pre-processing + ext v17.16b,v31.16b,v31.16b,#8 + eor v16.16b,v16.16b,v29.16b + eor v17.16b,v17.16b,v31.16b + ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11] ret .size gcm_init_v8,.-gcm_init_v8 .globl gcm_gmult_v8 @@ -550,6 +638,7 @@ gcm_ghash_v8_4x: ret .size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x +.section .rodata .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/keccak1600-armv8.S b/sys/crypto/openssl/aarch64/keccak1600-armv8.S index 08b3cc351213..e57e06f0f837 100644 --- a/sys/crypto/openssl/aarch64/keccak1600-armv8.S +++ b/sys/crypto/openssl/aarch64/keccak1600-armv8.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from keccak1600-armv8.pl. */ #include "arm_arch.h" -.text +.section .rodata .align 8 // strategic alignment and padding that allows to use // address value as loop termination condition... 
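Stepping back to the ghashv8-armx.S hunk above: gcm_init_v8 now computes and stores H^3 through H^8 (each pair packed with its Karatsuba middle term) rather than stopping at H^4, which is what an eight-blocks-at-a-time GHASH/AES-GCM path needs in order to fold eight blocks per reduction. A conceptual C sketch of that precomputation follows. It uses a plain, non-bit-reflected GF(2^128) multiply reduced by x^128 + x^7 + x^2 + x + 1; real GHASH works in a bit-reflected representation with the two-phase pmull reduction visible above, so this models the algebra only, and every name here is illustrative.

#include <stdint.h>

/* 128-bit field element: lo holds coefficients x^0..x^63, hi holds x^64..x^127. */
typedef struct { uint64_t hi, lo; } gf128;

/* Bit-serial multiply in GF(2^128) mod x^128 + x^7 + x^2 + x + 1.
 * Deliberately simple; not the reflected convention GHASH itself uses. */
static gf128 gf128_mul(gf128 a, gf128 b)
{
    gf128 r = { 0, 0 };

    for (int i = 0; i < 128; i++) {
        uint64_t bit = (i < 64) ? (b.lo >> i) & 1 : (b.hi >> (i - 64)) & 1;
        if (bit) {                       /* r ^= a * x^i */
            r.lo ^= a.lo;
            r.hi ^= a.hi;
        }
        /* a *= x, reducing x^128 to x^7 + x^2 + x + 1 (0x87) */
        uint64_t overflow = a.hi >> 63;
        a.hi = (a.hi << 1) | (a.lo >> 63);
        a.lo = (a.lo << 1) ^ (overflow ? 0x87 : 0);
    }
    return r;
}

/* Precompute H^1..H^8, the powers the extended Htable above provides
 * (the real table also interleaves the Karatsuba hi^lo halves). */
static void ghash_init_powers(gf128 Htable[8], gf128 H)
{
    Htable[0] = H;
    for (int i = 1; i < 8; i++)
        Htable[i] = gf128_mul(Htable[i - 1], H);
}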
@@ -33,11 +33,14 @@ iotas: .quad 0x0000000080000001 .quad 0x8000000080008008 .size iotas,.-iotas +.text + .type KeccakF1600_int,%function .align 5 KeccakF1600_int: AARCH64_SIGN_LINK_REGISTER - adr x28,iotas + adrp x28,iotas + add x28,x28,#:lo12:iotas stp x28,x30,[sp,#16] // 32 bytes on top are mine b .Loop .align 4 @@ -517,6 +520,8 @@ SHA3_squeeze: mov x20,x1 mov x21,x2 mov x22,x3 + cmp w4, #0 // w4 = 'next' argument + bne .Lnext_block .Loop_squeeze: ldr x4,[x0],#8 @@ -531,7 +536,7 @@ SHA3_squeeze: subs x3,x3,#8 bhi .Loop_squeeze - +.Lnext_block: mov x0,x19 bl KeccakF1600 mov x0,x19 @@ -577,7 +582,8 @@ SHA3_squeeze: .align 5 KeccakF1600_ce: mov x9,#24 - adr x10,iotas + adrp x10,iotas + add x10,x10,#:lo12:iotas b .Loop_ce .align 4 .Loop_ce: diff --git a/sys/crypto/openssl/aarch64/md5-aarch64.S b/sys/crypto/openssl/aarch64/md5-aarch64.S new file mode 100644 index 000000000000..88e736e49687 --- /dev/null +++ b/sys/crypto/openssl/aarch64/md5-aarch64.S @@ -0,0 +1,678 @@ +/* Do not modify. This file is auto-generated from md5-aarch64.pl. */ +#include "arm_arch.h" + +.text +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order,@function +ossl_md5_block_asm_data_order: + AARCH64_VALID_CALL_TARGET + // Save all callee-saved registers + stp x19,x20,[sp,#-80]! + stp x21,x22,[sp,#16] + stp x23,x24,[sp,#32] + stp x25,x26,[sp,#48] + stp x27,x28,[sp,#64] + + ldp w10, w11, [x0, #0] // .Load MD5 state->A and state->B + ldp w12, w13, [x0, #8] // .Load MD5 state->C and state->D +.align 5 +ossl_md5_blocks_loop: + eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + ldp w15, w20, [x1] // .Load 2 words of input data0 M[0],M[1] + ldp w3, w21, [x1, #8] // .Load 2 words of input data0 M[2],M[3] +#ifdef __AARCH64EB__ + rev w15, w15 + rev w20, w20 + rev w3, w3 + rev w21, w21 +#endif + eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x9, #0xa478 // .Load lower half of constant 0xd76aa478 + movk x9, #0xd76a, lsl #16 // .Load upper half of constant 0xd76aa478 + add w8, w10, w15 // Add dest value + add w7, w8, w9 // Add constant 0xd76aa478 + add w6, w7, w14 // Add aux function result + ror w6, w6, #25 // Rotate left s=7 bits + eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0]) + and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x16, #0xb756 // .Load lower half of constant 0xe8c7b756 + movk x16, #0xe8c7, lsl #16 // .Load upper half of constant 0xe8c7b756 + add w9, w13, w20 // Add dest value + add w7, w9, w16 // Add constant 0xe8c7b756 + add w14, w7, w17 // Add aux function result + ror w14, w14, #20 // Rotate left s=12 bits + eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1]) + and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x16, #0x70db // .Load lower half of constant 0x242070db + movk x16, #0x2420, lsl #16 // .Load upper half of constant 0x242070db + add w7, w12, w3 // Add dest value + add w17, w7, w16 // Add constant 0x242070db + add w14, w17, w9 // Add aux function result + ror w14, w14, #15 // Rotate left s=17 bits + eor x6, x5, x4 // Begin aux function round 1 
F(x,y,z)=(((y^z)&x)^z) + add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2]) + and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x9, #0xceee // .Load lower half of constant 0xc1bdceee + movk x9, #0xc1bd, lsl #16 // .Load upper half of constant 0xc1bdceee + add w14, w11, w21 // Add dest value + add w6, w14, w9 // Add constant 0xc1bdceee + add w7, w6, w16 // Add aux function result + ror w7, w7, #10 // Rotate left s=22 bits + eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3]) + ldp w14, w22, [x1, #16] // .Load 2 words of input data0 M[4],M[5] + ldp w7, w23, [x1, #24] // .Load 2 words of input data0 M[6],M[7] +#ifdef __AARCH64EB__ + rev w14, w14 + rev w22, w22 + rev w7, w7 + rev w23, w23 +#endif + and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x16, #0xfaf // .Load lower half of constant 0xf57c0faf + movk x16, #0xf57c, lsl #16 // .Load upper half of constant 0xf57c0faf + add w17, w4, w14 // Add dest value + add w16, w17, w16 // Add constant 0xf57c0faf + add w4, w16, w6 // Add aux function result + ror w4, w4, #25 // Rotate left s=7 bits + eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4]) + and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x4, #0xc62a // .Load lower half of constant 0x4787c62a + movk x4, #0x4787, lsl #16 // .Load upper half of constant 0x4787c62a + add w16, w5, w22 // Add dest value + add w16, w16, w4 // Add constant 0x4787c62a + add w5, w16, w6 // Add aux function result + ror w5, w5, #20 // Rotate left s=12 bits + eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5]) + and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x4, #0x4613 // .Load lower half of constant 0xa8304613 + movk x4, #0xa830, lsl #16 // .Load upper half of constant 0xa8304613 + add w6, w8, w7 // Add dest value + add w8, w6, w4 // Add constant 0xa8304613 + add w4, w8, w5 // Add aux function result + ror w4, w4, #15 // Rotate left s=17 bits + eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6]) + and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x6, #0x9501 // .Load lower half of constant 0xfd469501 + movk x6, #0xfd46, lsl #16 // .Load upper half of constant 0xfd469501 + add w9, w9, w23 // Add dest value + add w5, w9, w6 // Add constant 0xfd469501 + add w9, w5, w4 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7]) + ldp w5, w24, [x1, #32] // .Load 2 words of input data0 M[8],M[9] + ldp w16, w25, [x1, #40] // .Load 2 words of input data0 M[10],M[11] +#ifdef __AARCH64EB__ + rev w5, w5 + rev w24, w24 + rev w16, w16 + rev 
w25, w25 +#endif + and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x9, #0x98d8 // .Load lower half of constant 0x698098d8 + movk x9, #0x6980, lsl #16 // .Load upper half of constant 0x698098d8 + add w17, w17, w5 // Add dest value + add w9, w17, w9 // Add constant 0x698098d8 + add w17, w9, w6 // Add aux function result + ror w17, w17, #25 // Rotate left s=7 bits + eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8]) + and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x17, #0xf7af // .Load lower half of constant 0x8b44f7af + movk x17, #0x8b44, lsl #16 // .Load upper half of constant 0x8b44f7af + add w19, w19, w24 // Add dest value + add w17, w19, w17 // Add constant 0x8b44f7af + add w19, w17, w9 // Add aux function result + ror w19, w19, #20 // Rotate left s=12 bits + eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9]) + and x9, x9, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x11, #0x5bb1 // .Load lower half of constant 0xffff5bb1 + movk x11, #0xffff, lsl #16 // .Load upper half of constant 0xffff5bb1 + add w8, w8, w16 // Add dest value + add w8, w8, w11 // Add constant 0xffff5bb1 + add w8, w8, w9 // Add aux function result + ror w8, w8, #15 // Rotate left s=17 bits + eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10]) + and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x11, #0xd7be // .Load lower half of constant 0x895cd7be + movk x11, #0x895c, lsl #16 // .Load upper half of constant 0x895cd7be + add w4, w4, w25 // Add dest value + add w4, w4, w11 // Add constant 0x895cd7be + add w9, w4, w9 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11]) + ldp w11, w26, [x1, #48] // .Load 2 words of input data0 M[12],M[13] + ldp w12, w27, [x1, #56] // .Load 2 words of input data0 M[14],M[15] +#ifdef __AARCH64EB__ + rev w11, w11 + rev w26, w26 + rev w12, w12 + rev w27, w27 +#endif + and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x19, #0x1122 // .Load lower half of constant 0x6b901122 + movk x19, #0x6b90, lsl #16 // .Load upper half of constant 0x6b901122 + add w6, w6, w11 // Add dest value + add w6, w6, w19 // Add constant 0x6b901122 + add w4, w6, w4 // Add aux function result + ror w4, w4, #25 // Rotate left s=7 bits + eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12]) + and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x19, #0x7193 // .Load lower half of constant 0xfd987193 + movk x19, #0xfd98, lsl #16 // .Load upper half of constant 0xfd987193 + add w17, w17, 
w26 // Add dest value + add w17, w17, w19 // Add constant 0xfd987193 + add w17, w17, w6 // Add aux function result + ror w17, w17, #20 // Rotate left s=12 bits + eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13]) + and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x13, #0x438e // .Load lower half of constant 0xa679438e + movk x13, #0xa679, lsl #16 // .Load upper half of constant 0xa679438e + add w8, w8, w12 // Add dest value + add w8, w8, w13 // Add constant 0xa679438e + add w8, w8, w6 // Add aux function result + ror w8, w8, #15 // Rotate left s=17 bits + eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14]) + and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x13, #0x821 // .Load lower half of constant 0x49b40821 + movk x13, #0x49b4, lsl #16 // .Load upper half of constant 0x49b40821 + add w9, w9, w27 // Add dest value + add w9, w9, w13 // Add constant 0x49b40821 + add w9, w9, w6 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + bic x6, x8, x17 // Aux function round 2 (~z & y) + add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15]) + movz x13, #0x2562 // .Load lower half of constant 0xf61e2562 + movk x13, #0xf61e, lsl #16 // .Load upper half of constant 0xf61e2562 + add w4, w4, w20 // Add dest value + add w4, w4, w13 // Add constant 0xf61e2562 + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 (~z & y) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1]) + movz x13, #0xb340 // .Load lower half of constant 0xc040b340 + movk x13, #0xc040, lsl #16 // .Load upper half of constant 0xc040b340 + add w17, w17, w7 // Add dest value + add w17, w17, w13 // Add constant 0xc040b340 + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 (~z & y) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6]) + movz x13, #0x5a51 // .Load lower half of constant 0x265e5a51 + movk x13, #0x265e, lsl #16 // .Load upper half of constant 0x265e5a51 + add w8, w8, w25 // Add dest value + add w8, w8, w13 // Add constant 0x265e5a51 + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 (~z & y) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11]) + movz x13, #0xc7aa // .Load lower half of constant 0xe9b6c7aa + movk x13, #0xe9b6, lsl #16 // .Load upper half of constant 0xe9b6c7aa + add w9, w9, w15 // Add dest value + add w9, w9, w13 // Add constant 0xe9b6c7aa + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 (~z & y) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0]) 
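The comments above spell out the per-step auxiliary functions: round 1 computes F in the ((y ^ z) & x) ^ z form, and round 2 builds G(x,y,z) = (x & z) | (y & ~z) from an and/bic pair, adding the two parts because their set bits can never overlap. A small C sketch of those helpers and of the round-step shape (add, rotate left by s, add B), with illustrative names; the rotate left appears above as ror by 32 - s.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int s)
{
    return (v << s) | (v >> (32 - s));   /* the asm encodes this as ror #(32-s) */
}

/* Auxiliary functions in the exact forms the comments describe. */
static inline uint32_t md5_F(uint32_t x, uint32_t y, uint32_t z)
{
    return ((y ^ z) & x) ^ z;            /* equivalent to (x & y) | (~x & z) */
}

static inline uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & z) + (~z & y);           /* disjoint bits, so + behaves as | */
}

/* One round-1 step, e.g. A = FF(A, B, C, D, 0xd76aa478, s = 7, M[0]).
 * Sketch of the step shape only, not a full block function. */
static inline uint32_t md5_step_ff(uint32_t a, uint32_t b, uint32_t c,
                                   uint32_t d, uint32_t m, uint32_t k, int s)
{
    return b + rotl32(a + md5_F(b, c, d) + m + k, s);
}

Rounds 3 and 4 keep the same step shape and swap in H(x,y,z) = x ^ y ^ z and I(x,y,z) = (x | ~z) ^ y, matching the eor and orn/eor sequences later in the file.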
+ movz x13, #0x105d // .Load lower half of constant 0xd62f105d + movk x13, #0xd62f, lsl #16 // .Load upper half of constant 0xd62f105d + add w4, w4, w22 // Add dest value + add w4, w4, w13 // Add constant 0xd62f105d + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 (~z & y) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5]) + movz x13, #0x1453 // .Load lower half of constant 0x2441453 + movk x13, #0x244, lsl #16 // .Load upper half of constant 0x2441453 + add w17, w17, w16 // Add dest value + add w17, w17, w13 // Add constant 0x2441453 + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 (~z & y) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10]) + movz x13, #0xe681 // .Load lower half of constant 0xd8a1e681 + movk x13, #0xd8a1, lsl #16 // .Load upper half of constant 0xd8a1e681 + add w8, w8, w27 // Add dest value + add w8, w8, w13 // Add constant 0xd8a1e681 + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 (~z & y) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15]) + movz x13, #0xfbc8 // .Load lower half of constant 0xe7d3fbc8 + movk x13, #0xe7d3, lsl #16 // .Load upper half of constant 0xe7d3fbc8 + add w9, w9, w14 // Add dest value + add w9, w9, w13 // Add constant 0xe7d3fbc8 + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 (~z & y) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4]) + movz x13, #0xcde6 // .Load lower half of constant 0x21e1cde6 + movk x13, #0x21e1, lsl #16 // .Load upper half of constant 0x21e1cde6 + add w4, w4, w24 // Add dest value + add w4, w4, w13 // Add constant 0x21e1cde6 + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 (~z & y) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9]) + movz x13, #0x7d6 // .Load lower half of constant 0xc33707d6 + movk x13, #0xc337, lsl #16 // .Load upper half of constant 0xc33707d6 + add w17, w17, w12 // Add dest value + add w17, w17, w13 // Add constant 0xc33707d6 + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 (~z & y) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14]) + movz x13, #0xd87 // .Load lower half of constant 0xf4d50d87 + movk x13, #0xf4d5, lsl #16 // .Load upper half of constant 0xf4d50d87 + add w8, w8, w21 // Add dest value + add w8, w8, w13 // Add constant 0xf4d50d87 + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 (~z & y) + add w8, w17, w8 // Add X parameter round 2 
C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3]) + movz x13, #0x14ed // .Load lower half of constant 0x455a14ed + movk x13, #0x455a, lsl #16 // .Load upper half of constant 0x455a14ed + add w9, w9, w5 // Add dest value + add w9, w9, w13 // Add constant 0x455a14ed + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 (~z & y) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8]) + movz x13, #0xe905 // .Load lower half of constant 0xa9e3e905 + movk x13, #0xa9e3, lsl #16 // .Load upper half of constant 0xa9e3e905 + add w4, w4, w26 // Add dest value + add w4, w4, w13 // Add constant 0xa9e3e905 + and x13, x9, x17 // Aux function round 2 (x & z) + add w4, w4, w6 // Add (~z & y) + add w4, w4, w13 // Add (x & z) + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 (~z & y) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13]) + movz x13, #0xa3f8 // .Load lower half of constant 0xfcefa3f8 + movk x13, #0xfcef, lsl #16 // .Load upper half of constant 0xfcefa3f8 + add w17, w17, w3 // Add dest value + add w17, w17, w13 // Add constant 0xfcefa3f8 + and x13, x4, x8 // Aux function round 2 (x & z) + add w17, w17, w6 // Add (~z & y) + add w17, w17, w13 // Add (x & z) + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 (~z & y) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2]) + movz x13, #0x2d9 // .Load lower half of constant 0x676f02d9 + movk x13, #0x676f, lsl #16 // .Load upper half of constant 0x676f02d9 + add w8, w8, w23 // Add dest value + add w8, w8, w13 // Add constant 0x676f02d9 + and x13, x17, x9 // Aux function round 2 (x & z) + add w8, w8, w6 // Add (~z & y) + add w8, w8, w13 // Add (x & z) + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 (~z & y) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7]) + movz x13, #0x4c8a // .Load lower half of constant 0x8d2a4c8a + movk x13, #0x8d2a, lsl #16 // .Load upper half of constant 0x8d2a4c8a + add w9, w9, w11 // Add dest value + add w9, w9, w13 // Add constant 0x8d2a4c8a + and x13, x8, x4 // Aux function round 2 (x & z) + add w9, w9, w6 // Add (~z & y) + add w9, w9, w13 // Add (x & z) + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #12 // Rotate left s=20 bits + movz x10, #0x3942 // .Load lower half of constant 0xfffa3942 + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12]) + movk x10, #0xfffa, lsl #16 // .Load upper half of constant 0xfffa3942 + add w4, w4, w22 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xfffa3942 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0xf681 // .Load lower half of constant 0x8771f681 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5]) + movk x10, #0x8771, lsl #16 // .Load upper half of constant 0x8771f681 + add w17, w17, w5 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0x8771f681 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate 
left s=11 bits + movz x13, #0x6122 // .Load lower half of constant 0x6d9d6122 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8]) + movk x13, #0x6d9d, lsl #16 // .Load upper half of constant 0x6d9d6122 + add w8, w8, w25 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0x6d9d6122 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x380c // .Load lower half of constant 0xfde5380c + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11]) + movk x13, #0xfde5, lsl #16 // .Load upper half of constant 0xfde5380c + add w9, w9, w12 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xfde5380c + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0xea44 // .Load lower half of constant 0xa4beea44 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14]) + movk x10, #0xa4be, lsl #16 // .Load upper half of constant 0xa4beea44 + add w4, w4, w20 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xa4beea44 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0xcfa9 // .Load lower half of constant 0x4bdecfa9 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1]) + movk x10, #0x4bde, lsl #16 // .Load upper half of constant 0x4bdecfa9 + add w17, w17, w14 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0x4bdecfa9 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x4b60 // .Load lower half of constant 0xf6bb4b60 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4]) + movk x13, #0xf6bb, lsl #16 // .Load upper half of constant 0xf6bb4b60 + add w8, w8, w23 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0xf6bb4b60 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0xbc70 // .Load lower half of constant 0xbebfbc70 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xf6bb4b60, s=16, M[7]) + movk x13, #0xbebf, lsl #16 // .Load upper half of constant 0xbebfbc70 + add w9, w9, w16 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xbebfbc70 + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0x7ec6 // .Load lower half of constant 0x289b7ec6 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10]) + movk x10, #0x289b, lsl #16 // .Load upper half of constant 0x289b7ec6 + add w4, w4, w26 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0x289b7ec6 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate 
left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0x27fa // .Load lower half of constant 0xeaa127fa + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13]) + movk x10, #0xeaa1, lsl #16 // .Load upper half of constant 0xeaa127fa + add w17, w17, w15 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0xeaa127fa + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x3085 // .Load lower half of constant 0xd4ef3085 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0]) + movk x13, #0xd4ef, lsl #16 // .Load upper half of constant 0xd4ef3085 + add w8, w8, w21 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0xd4ef3085 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x1d05 // .Load lower half of constant 0x4881d05 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3]) + movk x13, #0x488, lsl #16 // .Load upper half of constant 0x4881d05 + add w9, w9, w7 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0x4881d05 + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0xd039 // .Load lower half of constant 0xd9d4d039 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6]) + movk x10, #0xd9d4, lsl #16 // .Load upper half of constant 0xd9d4d039 + add w4, w4, w24 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xd9d4d039 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0x99e5 // .Load lower half of constant 0xe6db99e5 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9]) + movk x10, #0xe6db, lsl #16 // .Load upper half of constant 0xe6db99e5 + add w17, w17, w11 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0xe6db99e5 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x7cf8 // .Load lower half of constant 0x1fa27cf8 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12]) + movk x13, #0x1fa2, lsl #16 // .Load upper half of constant 0x1fa27cf8 + add w8, w8, w27 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0x1fa27cf8 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x5665 // .Load lower half of constant 0xc4ac5665 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15]) + movk x13, #0xc4ac, lsl #16 // .Load upper half of constant 0xc4ac5665 + add w9, w9, w3 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xc4ac5665 + add w9, 
w9, w6 // Add aux function result + ror w9, w9, #9 // Rotate left s=23 bits + movz x6, #0x2244 // .Load lower half of constant 0xf4292244 + movk x6, #0xf429, lsl #16 // .Load upper half of constant 0xf4292244 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2]) + add w4, w4, w15 // Add dest value + orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w4, w6 // Add constant 0xf4292244 + eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w4, w6 // Add aux function result + ror w4, w4, #26 // Rotate left s=6 bits + movz x6, #0xff97 // .Load lower half of constant 0x432aff97 + movk x6, #0x432a, lsl #16 // .Load upper half of constant 0x432aff97 + add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0]) + orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w17, w23 // Add dest value + eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w17, w6 // Add constant 0x432aff97 + add w6, w17, w10 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x17, #0x23a7 // .Load lower half of constant 0xab9423a7 + movk x17, #0xab94, lsl #16 // .Load upper half of constant 0xab9423a7 + add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7]) + add w8, w8, w12 // Add dest value + orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w17 // Add constant 0xab9423a7 + eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w17 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x17, #0xa039 // .Load lower half of constant 0xfc93a039 + movk x17, #0xfc93, lsl #16 // .Load upper half of constant 0xfc93a039 + add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14]) + orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w22 // Add dest value + eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w17 // Add constant 0xfc93a039 + add w17, w9, w13 // Add aux function result + ror w17, w17, #11 // Rotate left s=21 bits + movz x9, #0x59c3 // .Load lower half of constant 0x655b59c3 + movk x9, #0x655b, lsl #16 // .Load upper half of constant 0x655b59c3 + add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5]) + add w4, w4, w11 // Add dest value + orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w4, w9 // Add constant 0x655b59c3 + eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w4 // Add aux function result + ror w9, w9, #26 // Rotate left s=6 bits + movz x4, #0xcc92 // .Load lower half of constant 0x8f0ccc92 + movk x4, #0x8f0c, lsl #16 // .Load upper half of constant 0x8f0ccc92 + add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12]) + orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w6, w21 // Add dest value + eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w6, w4 // Add constant 0x8f0ccc92 + add w6, w4, w10 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x4, #0xf47d // .Load lower half of constant 0xffeff47d + movk x4, #0xffef, lsl #16 // .Load upper half of constant 0xffeff47d + add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3]) + add w8, w8, w16 // Add dest value + orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, 
w4 // Add constant 0xffeff47d + eor x4, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w4 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x4, #0x5dd1 // .Load lower half of constant 0x85845dd1 + movk x4, #0x8584, lsl #16 // .Load upper half of constant 0x85845dd1 + add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10]) + orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w17, w20 // Add dest value + eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w15, w4 // Add constant 0x85845dd1 + add w4, w15, w17 // Add aux function result + ror w4, w4, #11 // Rotate left s=21 bits + movz x15, #0x7e4f // .Load lower half of constant 0x6fa87e4f + movk x15, #0x6fa8, lsl #16 // .Load upper half of constant 0x6fa87e4f + add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1]) + add w4, w9, w5 // Add dest value + orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w4, w15 // Add constant 0x6fa87e4f + eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w15, w4 // Add aux function result + ror w9, w9, #26 // Rotate left s=6 bits + movz x15, #0xe6e0 // .Load lower half of constant 0xfe2ce6e0 + movk x15, #0xfe2c, lsl #16 // .Load upper half of constant 0xfe2ce6e0 + add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8]) + orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w6, w27 // Add dest value + eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w6, w15 // Add constant 0xfe2ce6e0 + add w6, w15, w9 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x9, #0x4314 // .Load lower half of constant 0xa3014314 + movk x9, #0xa301, lsl #16 // .Load upper half of constant 0xa3014314 + add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15]) + add w6, w8, w7 // Add dest value + orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w6, w9 // Add constant 0xa3014314 + eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w8, w9 // Add aux function result + ror w6, w6, #17 // Rotate left s=15 bits + movz x7, #0x11a1 // .Load lower half of constant 0x4e0811a1 + movk x7, #0x4e08, lsl #16 // .Load upper half of constant 0x4e0811a1 + add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6]) + orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w17, w26 // Add dest value + eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w6, w7 // Add constant 0x4e0811a1 + add w7, w9, w17 // Add aux function result + ror w7, w7, #11 // Rotate left s=21 bits + movz x6, #0x7e82 // .Load lower half of constant 0xf7537e82 + movk x6, #0xf753, lsl #16 // .Load upper half of constant 0xf7537e82 + add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13]) + add w17, w4, w14 // Add dest value + orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w14, w17, w6 // Add constant 0xf7537e82 + eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w14, w4 // Add aux function result + ror w17, w17, #26 // Rotate left s=6 bits + movz x6, #0xf235 // .Load lower half of constant 0xbd3af235 + movk x6, #0xbd3a, lsl #16 // .Load upper half of constant 0xbd3af235 + add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4]) + orn x14, x7, 
x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w15, w25 // Add dest value + eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w4, w6 // Add constant 0xbd3af235 + add w16, w15, w17 // Add aux function result + ror w16, w16, #22 // Rotate left s=10 bits + movz x14, #0xd2bb // .Load lower half of constant 0x2ad7d2bb + movk x14, #0x2ad7, lsl #16 // .Load upper half of constant 0x2ad7d2bb + add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11]) + add w6, w8, w3 // Add dest value + orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w6, w14 // Add constant 0x2ad7d2bb + eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w17, w16 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x3, #0xd391 // .Load lower half of constant 0xeb86d391 + movk x3, #0xeb86, lsl #16 // .Load upper half of constant 0xeb86d391 + add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2]) + orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w9, w24 // Add dest value + eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w16, w15, w3 // Add constant 0xeb86d391 + add w8, w16, w17 // Add aux function result + ror w8, w8, #11 // Rotate left s=21 bits + ldp w6, w15, [x0] // Reload MD5 state->A and state->B + ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D + add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9]) + add w13, w4, w9 // Add result of MD5 rounds to state->D + add w12, w14, w5 // Add result of MD5 rounds to state->C + add w10, w7, w6 // Add result of MD5 rounds to state->A + add w11, w3, w15 // Add result of MD5 rounds to state->B + stp w12, w13, [x0, #8] // Store MD5 states C,D + stp w10, w11, [x0] // Store MD5 states A,B + add x1, x1, #64 // Increment data pointer + subs w2, w2, #1 // Decrement block counter + b.ne ossl_md5_blocks_loop + + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + ldp x25,x26,[sp,#48] + ldp x27,x28,[sp,#64] + ldp x19,x20,[sp],#80 + ret + diff --git a/sys/crypto/openssl/aarch64/poly1305-armv8.S b/sys/crypto/openssl/aarch64/poly1305-armv8.S index 8925984c3ee0..3e0ccf7ff0d2 100644 --- a/sys/crypto/openssl/aarch64/poly1305-armv8.S +++ b/sys/crypto/openssl/aarch64/poly1305-armv8.S @@ -41,10 +41,14 @@ poly1305_init: tst w17,#ARMV7_NEON - adr x12,.Lpoly1305_blocks - adr x7,.Lpoly1305_blocks_neon - adr x13,.Lpoly1305_emit - adr x8,.Lpoly1305_emit_neon + adrp x12,poly1305_blocks + add x12,x12,#:lo12:.Lpoly1305_blocks + adrp x7,poly1305_blocks_neon + add x7,x7,#:lo12:.Lpoly1305_blocks_neon + adrp x13,poly1305_emit + add x13,x13,#:lo12:.Lpoly1305_emit + adrp x8,poly1305_emit_neon + add x8,x8,#:lo12:.Lpoly1305_emit_neon csel x12,x12,x7,eq csel x13,x13,x8,eq @@ -374,7 +378,8 @@ poly1305_blocks_neon: ldr x30,[sp,#8] add x16,x1,#32 - adr x17,.Lzeros + adrp x17,.Lzeros + add x17,x17,#:lo12:.Lzeros subs x2,x2,#64 csel x16,x17,x16,lo @@ -386,7 +391,8 @@ poly1305_blocks_neon: .align 4 .Leven_neon: add x16,x1,#32 - adr x17,.Lzeros + adrp x17,.Lzeros + add x17,x17,#:lo12:.Lzeros subs x2,x2,#64 csel x16,x17,x16,lo @@ -869,6 +875,8 @@ poly1305_emit_neon: ret .size poly1305_emit_neon,.-poly1305_emit_neon +.section .rodata + .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 diff --git a/sys/crypto/openssl/aarch64/sha1-armv8.S b/sys/crypto/openssl/aarch64/sha1-armv8.S index 9e2d86072394..31627ee375a9 100644 --- a/sys/crypto/openssl/aarch64/sha1-armv8.S +++ 
b/sys/crypto/openssl/aarch64/sha1-armv8.S @@ -1081,7 +1081,8 @@ sha1_block_armv8: stp x29,x30,[sp,#-16]! add x29,sp,#0 - adr x4,.Lconst + adrp x4,.Lconst + add x4,x4,#:lo12:.Lconst eor v1.16b,v1.16b,v1.16b ld1 {v0.4s},[x0],#16 ld1 {v1.s}[0],[x0] @@ -1204,6 +1205,9 @@ sha1_block_armv8: ldr x29,[sp],#16 ret .size sha1_block_armv8,.-sha1_block_armv8 + +.section .rodata + .align 6 .Lconst: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 diff --git a/sys/crypto/openssl/aarch64/sha256-armv8.S b/sys/crypto/openssl/aarch64/sha256-armv8.S index 4f3934a4890c..0b26b4d0e92a 100644 --- a/sys/crypto/openssl/aarch64/sha256-armv8.S +++ b/sys/crypto/openssl/aarch64/sha256-armv8.S @@ -1,5 +1,5 @@ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ -// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// Copyright 2014-2025 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License 2.0 (the "License"). You may not use // this file except in compliance with the License. You can obtain a copy @@ -93,7 +93,8 @@ sha256_block_data_order: ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] - adr x30,.LK256 + adrp x30,.LK256 + add x30,x30,#:lo12:.LK256 stp x0,x2,[x29,#96] .Loop: @@ -1041,6 +1042,8 @@ sha256_block_data_order: ret .size sha256_block_data_order,.-sha256_block_data_order +.section .rodata + .align 6 .type .LK256,%object .LK256: @@ -1065,6 +1068,8 @@ sha256_block_data_order: .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 + +.text #ifndef __KERNEL__ .type sha256_block_armv8,%function .align 6 @@ -1075,7 +1080,8 @@ sha256_block_armv8: add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] - adr x3,.LK256 + adrp x3,.LK256 + add x3,x3,#:lo12:.LK256 .Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 @@ -1219,7 +1225,8 @@ sha256_block_neon: mov x29, sp sub sp,sp,#16*4 - adr x16,.LK256 + adrp x16,.LK256 + add x16,x16,#:lo12:.LK256 add x2,x1,x2,lsl#6 // len to point at the end of inp ld1 {v0.16b},[x1], #16 diff --git a/sys/crypto/openssl/aarch64/sha512-armv8.S b/sys/crypto/openssl/aarch64/sha512-armv8.S index c119d9cf5c95..d88d310020dc 100644 --- a/sys/crypto/openssl/aarch64/sha512-armv8.S +++ b/sys/crypto/openssl/aarch64/sha512-armv8.S @@ -1,5 +1,5 @@ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ -// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// Copyright 2014-2025 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License 2.0 (the "License"). You may not use // this file except in compliance with the License. 
You can obtain a copy @@ -91,7 +91,8 @@ sha512_block_data_order: ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] - adr x30,.LK512 + adrp x30,.LK512 + add x30,x30,#:lo12:.LK512 stp x0,x2,[x29,#96] .Loop: @@ -1039,6 +1040,8 @@ sha512_block_data_order: ret .size sha512_block_data_order,.-sha512_block_data_order +.section .rodata + .align 6 .type .LK512,%object .LK512: @@ -1087,6 +1090,8 @@ sha512_block_data_order: .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 + +.text #ifndef __KERNEL__ .type sha512_block_armv8,%function .align 6 @@ -1100,7 +1105,8 @@ sha512_block_armv8: ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context - adr x3,.LK512 + adrp x3,.LK512 + add x3,x3,#:lo12:.LK512 rev64 v16.16b,v16.16b rev64 v17.16b,v17.16b diff --git a/sys/crypto/openssl/aarch64/sm3-armv8.S b/sys/crypto/openssl/aarch64/sm3-armv8.S new file mode 100644 index 000000000000..08785cae9e16 --- /dev/null +++ b/sys/crypto/openssl/aarch64/sm3-armv8.S @@ -0,0 +1,509 @@ +/* Do not modify. This file is auto-generated from sm3-armv8.pl. */ +// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// This module implements support for Armv8 SM3 instructions + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" +.text +.globl ossl_hwsm3_block_data_order +.type ossl_hwsm3_block_data_order,%function +.align 5 +ossl_hwsm3_block_data_order: + AARCH64_VALID_CALL_TARGET + // load state + ld1 {v5.4s,v6.4s}, [x0] + rev64 v5.4s, v5.4s + rev64 v6.4s, v6.4s + ext v5.16b, v5.16b, v5.16b, #8 + ext v6.16b, v6.16b, v6.16b, #8 + adrp x8, .Tj + add x8, x8, #:lo12:.Tj + ldp s16, s17, [x8] + +.Loop: + // load input + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x1], #64 + sub w2, w2, #1 + + mov v18.16b, v5.16b + mov v19.16b, v6.16b + +#ifndef __AARCH64EB__ + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b +#endif + + ext v20.16b, v16.16b, v16.16b, #4 + // s4 = w7 | w8 | w9 | w10 + ext v4.16b, v1.16b, v2.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v0.16b, v1.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v2.16b, v3.16b, #8 +.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s +.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ae6 //sm3tt2a v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ae6 //sm3tt2a v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aae6 //sm3tt2a v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, 
v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bae6 //sm3tt2a v6.4s, v23.4s, v0.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v0.16b, v2.16b, v3.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v1.16b, v2.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v3.16b, v4.16b, #8 +.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s +.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s + eor v22.16b, v1.16b, v2.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce418ae6 //sm3tt2a v6.4s, v23.4s, v1.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce419ae6 //sm3tt2a v6.4s, v23.4s, v1.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce41aae6 //sm3tt2a v6.4s, v23.4s, v1.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce41bae6 //sm3tt2a v6.4s, v23.4s, v1.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v1.16b, v3.16b, v4.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v2.16b, v3.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v4.16b, v0.16b, #8 +.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s +.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s + eor v22.16b, v2.16b, v3.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce428ae6 //sm3tt2a v6.4s, v23.4s, v2.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce429ae6 //sm3tt2a v6.4s, v23.4s, v2.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce42aae6 //sm3tt2a v6.4s, v23.4s, v2.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce42bae6 //sm3tt2a v6.4s, v23.4s, v2.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v2.16b, v4.16b, v0.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v3.16b, v4.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v0.16b, v1.16b, #8 +.inst 0xce61c062 //sm3partw1 v2.4s, v3.4s, v1.4s +.inst 0xce76c6e2 //sm3partw2 v2.4s, v23.4s, v22.4s + eor v22.16b, v3.16b, v4.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce438ae6 //sm3tt2a v6.4s, v23.4s, v3.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce439ae6 //sm3tt2a v6.4s, v23.4s, v3.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce43aae6 //sm3tt2a v6.4s, v23.4s, v3.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl 
v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce43bae6 //sm3tt2a v6.4s, v23.4s, v3.4s[3] + ext v20.16b, v17.16b, v17.16b, #4 + // s4 = w7 | w8 | w9 | w10 + ext v3.16b, v0.16b, v1.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v4.16b, v0.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v1.16b, v2.16b, #8 +.inst 0xce62c083 //sm3partw1 v3.4s, v4.4s, v2.4s +.inst 0xce76c6e3 //sm3partw2 v3.4s, v23.4s, v22.4s + eor v22.16b, v4.16b, v0.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v4.16b, v1.16b, v2.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v0.16b, v1.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v2.16b, v3.16b, #8 +.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s +.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v0.16b, v2.16b, v3.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v1.16b, v2.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v3.16b, v4.16b, #8 +.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s +.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s + eor v22.16b, v1.16b, v2.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce418ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce419ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce41aee6 //sm3tt2b v6.4s, v23.4s, v1.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, 
v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce41bee6 //sm3tt2b v6.4s, v23.4s, v1.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v1.16b, v3.16b, v4.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v2.16b, v3.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v4.16b, v0.16b, #8 +.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s +.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s + eor v22.16b, v2.16b, v3.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce428ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce429ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce42aee6 //sm3tt2b v6.4s, v23.4s, v2.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce42bee6 //sm3tt2b v6.4s, v23.4s, v2.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v2.16b, v4.16b, v0.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v3.16b, v4.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v0.16b, v1.16b, #8 +.inst 0xce61c062 //sm3partw1 v2.4s, v3.4s, v1.4s +.inst 0xce76c6e2 //sm3partw2 v2.4s, v23.4s, v22.4s + eor v22.16b, v3.16b, v4.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce438ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce439ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce43aee6 //sm3tt2b v6.4s, v23.4s, v3.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce43bee6 //sm3tt2b v6.4s, v23.4s, v3.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v3.16b, v0.16b, v1.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v4.16b, v0.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v1.16b, v2.16b, #8 +.inst 0xce62c083 //sm3partw1 v3.4s, v4.4s, v2.4s +.inst 0xce76c6e3 //sm3partw2 v3.4s, v23.4s, v22.4s + eor v22.16b, v4.16b, v0.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl 
v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v4.16b, v1.16b, v2.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v0.16b, v1.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v2.16b, v3.16b, #8 +.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s +.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v0.16b, v2.16b, v3.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v1.16b, v2.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v3.16b, v4.16b, #8 +.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s +.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s + eor v22.16b, v1.16b, v2.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce418ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce419ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce41aee6 //sm3tt2b v6.4s, v23.4s, v1.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce41bee6 //sm3tt2b v6.4s, v23.4s, v1.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v1.16b, v3.16b, v4.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v2.16b, v3.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v4.16b, v0.16b, #8 +.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s +.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s + eor v22.16b, v2.16b, v3.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce428ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce429ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce42aee6 //sm3tt2b v6.4s, v23.4s, v2.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + 
sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce42bee6 //sm3tt2b v6.4s, v23.4s, v2.4s[3] + eor v22.16b, v3.16b, v4.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce438ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce439ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce43aee6 //sm3tt2b v6.4s, v23.4s, v3.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce43bee6 //sm3tt2b v6.4s, v23.4s, v3.4s[3] + eor v22.16b, v4.16b, v0.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3] + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3] + eor v5.16b, v5.16b, v18.16b + eor v6.16b, v6.16b, v19.16b + + // any remained blocks? + cbnz w2, .Loop + + // save state + rev64 v5.4s, v5.4s + rev64 v6.4s, v6.4s + ext v5.16b, v5.16b, v5.16b, #8 + ext v6.16b, v6.16b, v6.16b, #8 + st1 {v5.4s,v6.4s}, [x0] + ret +.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order +.section .rodata + +.type _sm3_consts,%object +.align 3 +_sm3_consts: +.Tj: +.word 0x79cc4519, 0x9d8a7a87 +.size _sm3_consts,.-_sm3_consts +.previous diff --git a/sys/crypto/openssl/aarch64/sm4-armv8.S b/sys/crypto/openssl/aarch64/sm4-armv8.S new file mode 100644 index 000000000000..4d3aa3cd70b3 --- /dev/null +++ b/sys/crypto/openssl/aarch64/sm4-armv8.S @@ -0,0 +1,1093 @@ +/* Do not modify. This file is auto-generated from sm4-armv8.pl. 
*/ +// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// +// This module implements support for SM4 hw support on aarch64 +// Oct 2021 +// + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.section .rodata +.type _sm4_v8_consts,%object +.align 6 +_sm4_v8_consts: +.Lck: +.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 +.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 +.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 +.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 +.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 +.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 +.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 +.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: +.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.size _sm4_v8_consts,.-_sm4_v8_consts +.previous + +.globl sm4_v8_set_encrypt_key +.type sm4_v8_set_encrypt_key,%function +.align 5 +sm4_v8_set_encrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {v0.4s},[x0] + adrp x2,.Lfk + add x2,x2,#:lo12:.Lfk + ld1 {v24.4s},[x2] + adrp x2,.Lck + add x2,x2,#:lo12:.Lck + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 +#ifndef __AARCH64EB__ + rev32 v0.16b,v0.16b +#endif + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] + eor v0.16b,v0.16b,v24.16b; +.inst 0xce70c800 //sm4ekey v0.4S,v0.4S,v16.4S +.inst 0xce71c801 //sm4ekey v1.4S,v0.4S,v17.4S +.inst 0xce72c822 //sm4ekey v2.4S,v1.4S,v18.4S +.inst 0xce73c843 //sm4ekey v3.4S,v2.4S,v19.4S +.inst 0xce74c864 //sm4ekey v4.4S,v3.4S,v20.4S + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 +.inst 0xce75c885 //sm4ekey v5.4S,v4.4S,v21.4S +.inst 0xce76c8a6 //sm4ekey v6.4S,v5.4S,v22.4S +.inst 0xce77c8c7 //sm4ekey v7.4S,v6.4S,v23.4S + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] + ret +.size sm4_v8_set_encrypt_key,.-sm4_v8_set_encrypt_key +.globl sm4_v8_set_decrypt_key +.type sm4_v8_set_decrypt_key,%function +.align 5 +sm4_v8_set_decrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {v7.4s},[x0] + adrp x2,.Lfk + add x2,x2,#:lo12:.Lfk + ld1 {v24.4s},[x2] + adrp x2,.Lck + add x2,x2,#:lo12:.Lck + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] + eor v7.16b, v7.16b,v24.16b; +.inst 0xce70c8e7 //sm4ekey v7.4S,v7.4S,v16.4S +.inst 0xce71c8e6 //sm4ekey v6.4S,v7.4S,v17.4S +.inst 0xce72c8c5 //sm4ekey v5.4S,v6.4S,v18.4S + rev64 v7.4s,v7.4s + rev64 v6.4s,v6.4s + ext v7.16b,v7.16b,v7.16b,#8 + ext v6.16b,v6.16b,v6.16b,#8 +.inst 0xce73c8a4 //sm4ekey v4.4S,v5.4S,v19.4S +.inst 0xce74c883 //sm4ekey v3.4S,v4.4S,v20.4S + rev64 v5.4s,v5.4s + rev64 v4.4s,v4.4s + ext v5.16b,v5.16b,v5.16b,#8 + ext v4.16b,v4.16b,v4.16b,#8 +.inst 0xce75c862 //sm4ekey v2.4S,v3.4S,v21.4S +.inst 0xce76c841 //sm4ekey v1.4S,v2.4S,v22.4S + rev64 v3.4s,v3.4s + rev64 v2.4s,v2.4s + ext v3.16b,v3.16b,v3.16b,#8 + ext v2.16b,v2.16b,v2.16b,#8 +.inst 0xce77c820 //sm4ekey v0.4S,v1.4S,v23.4S + rev64 v1.4s, v1.4s + rev64 v0.4s, v0.4s + ext v1.16b,v1.16b,v1.16b,#8 + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] + ret +.size sm4_v8_set_decrypt_key,.-sm4_v8_set_decrypt_key +.globl sm4_v8_encrypt +.type 
sm4_v8_encrypt,%function +.align 5 +sm4_v8_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v16.4s},[x0] + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x2],64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x2] +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + st1 {v16.4s},[x1] + ret +.size sm4_v8_encrypt,.-sm4_v8_encrypt +.globl sm4_v8_decrypt +.type sm4_v8_decrypt,%function +.align 5 +sm4_v8_decrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v16.4s},[x0] + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x2],64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x2] +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + st1 {v16.4s},[x1] + ret +.size sm4_v8_decrypt,.-sm4_v8_decrypt +.globl sm4_v8_ecb_encrypt +.type sm4_v8_ecb_encrypt,%function +.align 5 +sm4_v8_ecb_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] +1: + cmp x2,#64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + cmp x2,#128 + b.lt 2f + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 + // 8 blocks +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08411 //sm4e v17.4s,v0.4s +.inst 0xcec08412 //sm4e v18.4s,v0.4s +.inst 0xcec08413 //sm4e v19.4s,v0.4s + +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08431 //sm4e v17.4s,v1.4s +.inst 0xcec08432 //sm4e v18.4s,v1.4s +.inst 0xcec08433 //sm4e v19.4s,v1.4s + +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08451 //sm4e v17.4s,v2.4s +.inst 0xcec08452 //sm4e v18.4s,v2.4s +.inst 0xcec08453 //sm4e v19.4s,v2.4s + +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08471 //sm4e v17.4s,v3.4s +.inst 0xcec08472 //sm4e v18.4s,v3.4s +.inst 0xcec08473 //sm4e v19.4s,v3.4s + +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec08491 //sm4e v17.4s,v4.4s +.inst 0xcec08492 //sm4e v18.4s,v4.4s +.inst 0xcec08493 //sm4e v19.4s,v4.4s + +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084b1 //sm4e v17.4s,v5.4s +.inst 0xcec084b2 //sm4e v18.4s,v5.4s +.inst 0xcec084b3 //sm4e v19.4s,v5.4s + +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084d1 //sm4e v17.4s,v6.4s +.inst 0xcec084d2 //sm4e v18.4s,v6.4s +.inst 0xcec084d3 //sm4e v19.4s,v6.4s + +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S +.inst 0xcec084f1 //sm4e v17.4s,v7.4s + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S +.inst 0xcec084f2 //sm4e v18.4s,v7.4s + ext v17.16b,v17.16b,v17.16b,#8 + 
rev64 v18.4S,v18.4S +.inst 0xcec084f3 //sm4e v19.4s,v7.4s + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +.inst 0xcec08414 //sm4e v20.4s,v0.4s +.inst 0xcec08415 //sm4e v21.4s,v0.4s +.inst 0xcec08416 //sm4e v22.4s,v0.4s +.inst 0xcec08417 //sm4e v23.4s,v0.4s + +.inst 0xcec08434 //sm4e v20.4s,v1.4s +.inst 0xcec08435 //sm4e v21.4s,v1.4s +.inst 0xcec08436 //sm4e v22.4s,v1.4s +.inst 0xcec08437 //sm4e v23.4s,v1.4s + +.inst 0xcec08454 //sm4e v20.4s,v2.4s +.inst 0xcec08455 //sm4e v21.4s,v2.4s +.inst 0xcec08456 //sm4e v22.4s,v2.4s +.inst 0xcec08457 //sm4e v23.4s,v2.4s + +.inst 0xcec08474 //sm4e v20.4s,v3.4s +.inst 0xcec08475 //sm4e v21.4s,v3.4s +.inst 0xcec08476 //sm4e v22.4s,v3.4s +.inst 0xcec08477 //sm4e v23.4s,v3.4s + +.inst 0xcec08494 //sm4e v20.4s,v4.4s +.inst 0xcec08495 //sm4e v21.4s,v4.4s +.inst 0xcec08496 //sm4e v22.4s,v4.4s +.inst 0xcec08497 //sm4e v23.4s,v4.4s + +.inst 0xcec084b4 //sm4e v20.4s,v5.4s +.inst 0xcec084b5 //sm4e v21.4s,v5.4s +.inst 0xcec084b6 //sm4e v22.4s,v5.4s +.inst 0xcec084b7 //sm4e v23.4s,v5.4s + +.inst 0xcec084d4 //sm4e v20.4s,v6.4s +.inst 0xcec084d5 //sm4e v21.4s,v6.4s +.inst 0xcec084d6 //sm4e v22.4s,v6.4s +.inst 0xcec084d7 //sm4e v23.4s,v6.4s + +.inst 0xcec084f4 //sm4e v20.4s,v7.4s + rev64 v20.4S,v20.4S +.inst 0xcec084f5 //sm4e v21.4s,v7.4s + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4S,v21.4S +.inst 0xcec084f6 //sm4e v22.4s,v7.4s + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4S,v22.4S +.inst 0xcec084f7 //sm4e v23.4s,v7.4s + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4S,v23.4S + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 +#ifndef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,#128 + b.gt 1b + ret + // 4 blocks +2: +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08411 //sm4e v17.4s,v0.4s +.inst 0xcec08412 //sm4e v18.4s,v0.4s +.inst 0xcec08413 //sm4e v19.4s,v0.4s + +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08431 //sm4e v17.4s,v1.4s +.inst 0xcec08432 //sm4e v18.4s,v1.4s +.inst 0xcec08433 //sm4e v19.4s,v1.4s + +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08451 //sm4e v17.4s,v2.4s +.inst 0xcec08452 //sm4e v18.4s,v2.4s +.inst 0xcec08453 //sm4e v19.4s,v2.4s + +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08471 //sm4e v17.4s,v3.4s +.inst 0xcec08472 //sm4e v18.4s,v3.4s +.inst 0xcec08473 //sm4e v19.4s,v3.4s + +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec08491 //sm4e v17.4s,v4.4s +.inst 0xcec08492 //sm4e v18.4s,v4.4s +.inst 0xcec08493 //sm4e v19.4s,v4.4s + +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084b1 //sm4e v17.4s,v5.4s +.inst 0xcec084b2 //sm4e v18.4s,v5.4s +.inst 0xcec084b3 //sm4e v19.4s,v5.4s + +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084d1 //sm4e v17.4s,v6.4s +.inst 0xcec084d2 //sm4e v18.4s,v6.4s +.inst 0xcec084d3 //sm4e v19.4s,v6.4s + +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S +.inst 0xcec084f1 
//sm4e v17.4s,v7.4s + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S +.inst 0xcec084f2 //sm4e v18.4s,v7.4s + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4S,v18.4S +.inst 0xcec084f3 //sm4e v19.4s,v7.4s + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.gt 1b +1: + subs x2,x2,#16 + b.lt 1f + ld1 {v16.4s},[x0],#16 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + st1 {v16.4s},[x1],#16 + b.ne 1b +1: + ret +.size sm4_v8_ecb_encrypt,.-sm4_v8_ecb_encrypt +.globl sm4_v8_cbc_encrypt +.type sm4_v8_cbc_encrypt,%function +.align 5 +sm4_v8_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] + ld1 {v8.4s},[x4] + cmp w5,#0 + b.eq .Ldec +1: + cmp x2, #64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + eor v16.16b,v16.16b,v8.16b +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S + ext v16.16b,v16.16b,v16.16b,#8 + eor v17.16b,v17.16b,v16.16b +.inst 0xcec08411 //sm4e v17.4s,v0.4s +.inst 0xcec08431 //sm4e v17.4s,v1.4s +.inst 0xcec08451 //sm4e v17.4s,v2.4s +.inst 0xcec08471 //sm4e v17.4s,v3.4s +.inst 0xcec08491 //sm4e v17.4s,v4.4s +.inst 0xcec084b1 //sm4e v17.4s,v5.4s +.inst 0xcec084d1 //sm4e v17.4s,v6.4s +.inst 0xcec084f1 //sm4e v17.4s,v7.4s + rev64 v17.4S,v17.4S + ext v17.16b,v17.16b,v17.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + eor v18.16b,v18.16b,v17.16b +.inst 0xcec08412 //sm4e v18.4s,v0.4s +.inst 0xcec08432 //sm4e v18.4s,v1.4s +.inst 0xcec08452 //sm4e v18.4s,v2.4s +.inst 0xcec08472 //sm4e v18.4s,v3.4s +.inst 0xcec08492 //sm4e v18.4s,v4.4s +.inst 0xcec084b2 //sm4e v18.4s,v5.4s +.inst 0xcec084d2 //sm4e v18.4s,v6.4s +.inst 0xcec084f2 //sm4e v18.4s,v7.4s + rev64 v18.4S,v18.4S + ext v18.16b,v18.16b,v18.16b,#8 +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + eor v19.16b,v19.16b,v18.16b +.inst 0xcec08413 //sm4e v19.4s,v0.4s +.inst 0xcec08433 //sm4e v19.4s,v1.4s +.inst 0xcec08453 //sm4e v19.4s,v2.4s +.inst 0xcec08473 //sm4e v19.4s,v3.4s +.inst 0xcec08493 //sm4e v19.4s,v4.4s +.inst 0xcec084b3 //sm4e v19.4s,v5.4s +.inst 0xcec084d3 //sm4e v19.4s,v6.4s +.inst 0xcec084f3 //sm4e v19.4s,v7.4s + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif + mov 
v8.16b,v19.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.ne 1b +1: + subs x2,x2,#16 + b.lt 3f + ld1 {v16.4s},[x0],#16 + eor v8.16b,v8.16b,v16.16b +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +.inst 0xcec08408 //sm4e v8.4s,v0.4s +.inst 0xcec08428 //sm4e v8.4s,v1.4s +.inst 0xcec08448 //sm4e v8.4s,v2.4s +.inst 0xcec08468 //sm4e v8.4s,v3.4s +.inst 0xcec08488 //sm4e v8.4s,v4.4s +.inst 0xcec084a8 //sm4e v8.4s,v5.4s +.inst 0xcec084c8 //sm4e v8.4s,v6.4s +.inst 0xcec084e8 //sm4e v8.4s,v7.4s + rev64 v8.4S,v8.4S + ext v8.16b,v8.16b,v8.16b,#8 +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + st1 {v8.4s},[x1],#16 + b.ne 1b + b 3f +.Ldec: +1: + cmp x2, #64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0] + ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 + cmp x2,#128 + b.lt 2f + // 8 blocks mode + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0] + ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08411 //sm4e v17.4s,v0.4s +.inst 0xcec08412 //sm4e v18.4s,v0.4s +.inst 0xcec08413 //sm4e v19.4s,v0.4s + +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08431 //sm4e v17.4s,v1.4s +.inst 0xcec08432 //sm4e v18.4s,v1.4s +.inst 0xcec08433 //sm4e v19.4s,v1.4s + +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08451 //sm4e v17.4s,v2.4s +.inst 0xcec08452 //sm4e v18.4s,v2.4s +.inst 0xcec08453 //sm4e v19.4s,v2.4s + +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08471 //sm4e v17.4s,v3.4s +.inst 0xcec08472 //sm4e v18.4s,v3.4s +.inst 0xcec08473 //sm4e v19.4s,v3.4s + +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec08491 //sm4e v17.4s,v4.4s +.inst 0xcec08492 //sm4e v18.4s,v4.4s +.inst 0xcec08493 //sm4e v19.4s,v4.4s + +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084b1 //sm4e v17.4s,v5.4s +.inst 0xcec084b2 //sm4e v18.4s,v5.4s +.inst 0xcec084b3 //sm4e v19.4s,v5.4s + +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084d1 //sm4e v17.4s,v6.4s +.inst 0xcec084d2 //sm4e v18.4s,v6.4s +.inst 0xcec084d3 //sm4e v19.4s,v6.4s + +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S +.inst 0xcec084f1 //sm4e v17.4s,v7.4s + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S +.inst 0xcec084f2 //sm4e v18.4s,v7.4s + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4S,v18.4S +.inst 0xcec084f3 //sm4e v19.4s,v7.4s + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +.inst 0xcec08414 //sm4e v20.4s,v0.4s +.inst 0xcec08415 //sm4e v21.4s,v0.4s +.inst 0xcec08416 //sm4e v22.4s,v0.4s +.inst 0xcec08417 //sm4e v23.4s,v0.4s + +.inst 0xcec08434 //sm4e v20.4s,v1.4s +.inst 0xcec08435 //sm4e v21.4s,v1.4s +.inst 0xcec08436 //sm4e v22.4s,v1.4s +.inst 0xcec08437 //sm4e v23.4s,v1.4s + +.inst 0xcec08454 //sm4e v20.4s,v2.4s +.inst 0xcec08455 //sm4e v21.4s,v2.4s +.inst 0xcec08456 //sm4e v22.4s,v2.4s +.inst 0xcec08457 //sm4e v23.4s,v2.4s + +.inst 0xcec08474 //sm4e v20.4s,v3.4s +.inst 0xcec08475 //sm4e v21.4s,v3.4s +.inst 0xcec08476 //sm4e v22.4s,v3.4s +.inst 0xcec08477 //sm4e v23.4s,v3.4s + +.inst 0xcec08494 //sm4e v20.4s,v4.4s +.inst 0xcec08495 //sm4e v21.4s,v4.4s +.inst 0xcec08496 //sm4e v22.4s,v4.4s +.inst 0xcec08497 
//sm4e v23.4s,v4.4s + +.inst 0xcec084b4 //sm4e v20.4s,v5.4s +.inst 0xcec084b5 //sm4e v21.4s,v5.4s +.inst 0xcec084b6 //sm4e v22.4s,v5.4s +.inst 0xcec084b7 //sm4e v23.4s,v5.4s + +.inst 0xcec084d4 //sm4e v20.4s,v6.4s +.inst 0xcec084d5 //sm4e v21.4s,v6.4s +.inst 0xcec084d6 //sm4e v22.4s,v6.4s +.inst 0xcec084d7 //sm4e v23.4s,v6.4s + +.inst 0xcec084f4 //sm4e v20.4s,v7.4s + rev64 v20.4S,v20.4S +.inst 0xcec084f5 //sm4e v21.4s,v7.4s + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4S,v21.4S +.inst 0xcec084f6 //sm4e v22.4s,v7.4s + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4S,v22.4S +.inst 0xcec084f7 //sm4e v23.4s,v7.4s + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4S,v23.4S + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + eor v18.16b,v18.16b,v25.16b + mov v8.16b,v31.16b + eor v19.16b,v19.16b,v26.16b + eor v20.16b,v20.16b,v27.16b + eor v21.16b,v21.16b,v28.16b + eor v22.16b,v22.16b,v29.16b + eor v23.16b,v23.16b,v30.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,128 + b.gt 1b + b 3f + // 4 blocks mode +2: +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08411 //sm4e v17.4s,v0.4s +.inst 0xcec08412 //sm4e v18.4s,v0.4s +.inst 0xcec08413 //sm4e v19.4s,v0.4s + +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08431 //sm4e v17.4s,v1.4s +.inst 0xcec08432 //sm4e v18.4s,v1.4s +.inst 0xcec08433 //sm4e v19.4s,v1.4s + +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08451 //sm4e v17.4s,v2.4s +.inst 0xcec08452 //sm4e v18.4s,v2.4s +.inst 0xcec08453 //sm4e v19.4s,v2.4s + +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08471 //sm4e v17.4s,v3.4s +.inst 0xcec08472 //sm4e v18.4s,v3.4s +.inst 0xcec08473 //sm4e v19.4s,v3.4s + +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec08491 //sm4e v17.4s,v4.4s +.inst 0xcec08492 //sm4e v18.4s,v4.4s +.inst 0xcec08493 //sm4e v19.4s,v4.4s + +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084b1 //sm4e v17.4s,v5.4s +.inst 0xcec084b2 //sm4e v18.4s,v5.4s +.inst 0xcec084b3 //sm4e v19.4s,v5.4s + +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084d1 //sm4e v17.4s,v6.4s +.inst 0xcec084d2 //sm4e v18.4s,v6.4s +.inst 0xcec084d3 //sm4e v19.4s,v6.4s + +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S +.inst 0xcec084f1 //sm4e v17.4s,v7.4s + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S +.inst 0xcec084f2 //sm4e v18.4s,v7.4s + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4S,v18.4S +.inst 0xcec084f3 //sm4e v19.4s,v7.4s + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + mov v8.16b,v27.16b + eor v18.16b,v18.16b,v25.16b + eor 
v19.16b,v19.16b,v26.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.gt 1b +1: + subs x2,x2,#16 + b.lt 3f + ld1 {v16.4s},[x0],#16 + mov v24.16b,v16.16b +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + eor v16.16b,v16.16b,v8.16b + mov v8.16b,v24.16b + st1 {v16.4s},[x1],#16 + b.ne 1b +3: + // save back IV + st1 {v8.4s},[x4] + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt +.globl sm4_v8_ctr32_encrypt_blocks +.type sm4_v8_ctr32_encrypt_blocks,%function +.align 5 +sm4_v8_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {v8.4s},[x4] + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov w5,v8.s[3] +1: + cmp x2,#4 + b.lt 1f + ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 + mov v16.16b,v8.16b + mov v17.16b,v8.16b + mov v18.16b,v8.16b + mov v19.16b,v8.16b + add w5,w5,#1 + mov v17.s[3],w5 + add w5,w5,#1 + mov v18.s[3],w5 + add w5,w5,#1 + mov v19.s[3],w5 + cmp x2,#8 + b.lt 2f + ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 + mov v20.16b,v8.16b + mov v21.16b,v8.16b + mov v22.16b,v8.16b + mov v23.16b,v8.16b + add w5,w5,#1 + mov v20.s[3],w5 + add w5,w5,#1 + mov v21.s[3],w5 + add w5,w5,#1 + mov v22.s[3],w5 + add w5,w5,#1 + mov v23.s[3],w5 +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08411 //sm4e v17.4s,v0.4s +.inst 0xcec08412 //sm4e v18.4s,v0.4s +.inst 0xcec08413 //sm4e v19.4s,v0.4s + +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08431 //sm4e v17.4s,v1.4s +.inst 0xcec08432 //sm4e v18.4s,v1.4s +.inst 0xcec08433 //sm4e v19.4s,v1.4s + +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08451 //sm4e v17.4s,v2.4s +.inst 0xcec08452 //sm4e v18.4s,v2.4s +.inst 0xcec08453 //sm4e v19.4s,v2.4s + +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08471 //sm4e v17.4s,v3.4s +.inst 0xcec08472 //sm4e v18.4s,v3.4s +.inst 0xcec08473 //sm4e v19.4s,v3.4s + +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec08491 //sm4e v17.4s,v4.4s +.inst 0xcec08492 //sm4e v18.4s,v4.4s +.inst 0xcec08493 //sm4e v19.4s,v4.4s + +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084b1 //sm4e v17.4s,v5.4s +.inst 0xcec084b2 //sm4e v18.4s,v5.4s +.inst 0xcec084b3 //sm4e v19.4s,v5.4s + +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084d1 //sm4e v17.4s,v6.4s +.inst 0xcec084d2 //sm4e v18.4s,v6.4s +.inst 0xcec084d3 //sm4e v19.4s,v6.4s + +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S +.inst 0xcec084f1 //sm4e v17.4s,v7.4s + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S +.inst 0xcec084f2 //sm4e v18.4s,v7.4s + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4S,v18.4S +.inst 0xcec084f3 //sm4e v19.4s,v7.4s + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +.inst 0xcec08414 //sm4e v20.4s,v0.4s +.inst 0xcec08415 //sm4e v21.4s,v0.4s +.inst 0xcec08416 //sm4e v22.4s,v0.4s +.inst 0xcec08417 //sm4e v23.4s,v0.4s + +.inst 0xcec08434 //sm4e v20.4s,v1.4s +.inst 0xcec08435 //sm4e v21.4s,v1.4s +.inst 0xcec08436 //sm4e v22.4s,v1.4s +.inst 0xcec08437 //sm4e v23.4s,v1.4s + +.inst 0xcec08454 //sm4e v20.4s,v2.4s +.inst 0xcec08455 //sm4e v21.4s,v2.4s 
+.inst 0xcec08456 //sm4e v22.4s,v2.4s +.inst 0xcec08457 //sm4e v23.4s,v2.4s + +.inst 0xcec08474 //sm4e v20.4s,v3.4s +.inst 0xcec08475 //sm4e v21.4s,v3.4s +.inst 0xcec08476 //sm4e v22.4s,v3.4s +.inst 0xcec08477 //sm4e v23.4s,v3.4s + +.inst 0xcec08494 //sm4e v20.4s,v4.4s +.inst 0xcec08495 //sm4e v21.4s,v4.4s +.inst 0xcec08496 //sm4e v22.4s,v4.4s +.inst 0xcec08497 //sm4e v23.4s,v4.4s + +.inst 0xcec084b4 //sm4e v20.4s,v5.4s +.inst 0xcec084b5 //sm4e v21.4s,v5.4s +.inst 0xcec084b6 //sm4e v22.4s,v5.4s +.inst 0xcec084b7 //sm4e v23.4s,v5.4s + +.inst 0xcec084d4 //sm4e v20.4s,v6.4s +.inst 0xcec084d5 //sm4e v21.4s,v6.4s +.inst 0xcec084d6 //sm4e v22.4s,v6.4s +.inst 0xcec084d7 //sm4e v23.4s,v6.4s + +.inst 0xcec084f4 //sm4e v20.4s,v7.4s + rev64 v20.4S,v20.4S +.inst 0xcec084f5 //sm4e v21.4s,v7.4s + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4S,v21.4S +.inst 0xcec084f6 //sm4e v22.4s,v7.4s + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4S,v22.4S +.inst 0xcec084f7 //sm4e v23.4s,v7.4s + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4S,v23.4S + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif + eor v16.16b,v16.16b,v24.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v26.16b + eor v19.16b,v19.16b,v27.16b + eor v20.16b,v20.16b,v28.16b + eor v21.16b,v21.16b,v29.16b + eor v22.16b,v22.16b,v30.16b + eor v23.16b,v23.16b,v31.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,#8 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +2: +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08411 //sm4e v17.4s,v0.4s +.inst 0xcec08412 //sm4e v18.4s,v0.4s +.inst 0xcec08413 //sm4e v19.4s,v0.4s + +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08431 //sm4e v17.4s,v1.4s +.inst 0xcec08432 //sm4e v18.4s,v1.4s +.inst 0xcec08433 //sm4e v19.4s,v1.4s + +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08451 //sm4e v17.4s,v2.4s +.inst 0xcec08452 //sm4e v18.4s,v2.4s +.inst 0xcec08453 //sm4e v19.4s,v2.4s + +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08471 //sm4e v17.4s,v3.4s +.inst 0xcec08472 //sm4e v18.4s,v3.4s +.inst 0xcec08473 //sm4e v19.4s,v3.4s + +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec08491 //sm4e v17.4s,v4.4s +.inst 0xcec08492 //sm4e v18.4s,v4.4s +.inst 0xcec08493 //sm4e v19.4s,v4.4s + +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084b1 //sm4e v17.4s,v5.4s +.inst 0xcec084b2 //sm4e v18.4s,v5.4s +.inst 0xcec084b3 //sm4e v19.4s,v5.4s + +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084d1 //sm4e v17.4s,v6.4s +.inst 0xcec084d2 //sm4e v18.4s,v6.4s +.inst 0xcec084d3 //sm4e v19.4s,v6.4s + +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S +.inst 0xcec084f1 //sm4e v17.4s,v7.4s + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S +.inst 0xcec084f2 //sm4e v18.4s,v7.4s + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4S,v18.4S +.inst 0xcec084f3 //sm4e v19.4s,v7.4s + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __AARCH64EB__ + rev32 
v19.16b,v19.16b +#endif + eor v16.16b,v16.16b,v24.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v26.16b + eor v19.16b,v19.16b,v27.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#4 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +1: + subs x2,x2,#1 + b.lt 3f + mov v16.16b,v8.16b + ld1 {v24.4s},[x0],#16 +.inst 0xcec08410 //sm4e v16.4s,v0.4s +.inst 0xcec08430 //sm4e v16.4s,v1.4s +.inst 0xcec08450 //sm4e v16.4s,v2.4s +.inst 0xcec08470 //sm4e v16.4s,v3.4s +.inst 0xcec08490 //sm4e v16.4s,v4.4s +.inst 0xcec084b0 //sm4e v16.4s,v5.4s +.inst 0xcec084d0 //sm4e v16.4s,v6.4s +.inst 0xcec084f0 //sm4e v16.4s,v7.4s + rev64 v16.4S,v16.4S + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + eor v16.16b,v16.16b,v24.16b + st1 {v16.4s},[x1],#16 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +3: + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks diff --git a/sys/crypto/openssl/aarch64/vpaes-armv8.S b/sys/crypto/openssl/aarch64/vpaes-armv8.S index c6338b00d5f6..09f0ba9a558f 100644 --- a/sys/crypto/openssl/aarch64/vpaes-armv8.S +++ b/sys/crypto/openssl/aarch64/vpaes-armv8.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from vpaes-armv8.pl. */ #include "arm_arch.h" -.text +.section .rodata .type _vpaes_consts,%object .align 7 // totally strategic alignment @@ -93,6 +93,9 @@ _vpaes_consts: .align 2 .size _vpaes_consts,.-_vpaes_consts .align 6 + +.text + // // _aes_preheat // @@ -102,7 +105,8 @@ _vpaes_consts: .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: - adr x10, .Lk_inv + adrp x10, .Lk_inv + add x10, x10, #:lo12:.Lk_inv movi v17.16b, #0x0f ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo @@ -130,7 +134,8 @@ _vpaes_encrypt_preheat: _vpaes_encrypt_core: mov x9, x2 ldr w8, [x2,#240] // pull rounds - adr x11, .Lk_mc_forward+16 + adrp x11, .Lk_mc_forward+16 + add x11, x11, #:lo12:.Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 @@ -217,7 +222,8 @@ vpaes_encrypt: _vpaes_encrypt_2x: mov x9, x2 ldr w8, [x2,#240] // pull rounds - adr x11, .Lk_mc_forward+16 + adrp x11, .Lk_mc_forward+16 + add x11, x11, #:lo12:.Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 @@ -320,9 +326,11 @@ _vpaes_encrypt_2x: .type _vpaes_decrypt_preheat,%function .align 4 _vpaes_decrypt_preheat: - adr x10, .Lk_inv + adrp x10, .Lk_inv + add x10, x10, #:lo12:.Lk_inv movi v17.16b, #0x0f - adr x11, .Lk_dipt + adrp x11, .Lk_dipt + add x11, x11, #:lo12:.Lk_dipt ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd @@ -344,10 +352,12 @@ _vpaes_decrypt_core: // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 eor x11, x11, #0x30 // xor $0x30, %r11 - adr x10, .Lk_sr + adrp x10, .Lk_sr + add x10, x10, #:lo12:.Lk_sr and x11, x11, #0x30 // and $0x30, %r11 add x11, x11, x10 - adr x10, .Lk_mc_forward+48 + adrp x10, .Lk_mc_forward+48 + add x10, x10, #:lo12:.Lk_mc_forward+48 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 @@ -455,10 +465,12 @@ _vpaes_decrypt_2x: // vmovdqa .Lk_dipt(%rip), 
%xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 eor x11, x11, #0x30 // xor $0x30, %r11 - adr x10, .Lk_sr + adrp x10, .Lk_sr + add x10, x10, #:lo12:.Lk_sr and x11, x11, #0x30 // and $0x30, %r11 add x11, x11, x10 - adr x10, .Lk_mc_forward+48 + adrp x10, .Lk_mc_forward+48 + add x10, x10, #:lo12:.Lk_mc_forward+48 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 @@ -587,14 +599,18 @@ _vpaes_decrypt_2x: .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: - adr x10, .Lk_inv + adrp x10, .Lk_inv + add x10, x10, #:lo12:.Lk_inv movi v16.16b, #0x5b // .Lk_s63 - adr x11, .Lk_sb1 + adrp x11, .Lk_sb1 + add x11, x11, #:lo12:.Lk_sb1 movi v17.16b, #0x0f // .Lk_s0F ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt - adr x10, .Lk_dksd + adrp x10, .Lk_dksd + add x10, x10, #:lo12:.Lk_dksd ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 - adr x11, .Lk_mc_forward + adrp x11, .Lk_mc_forward + add x11, x11, #:lo12:.Lk_mc_forward ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 ld1 {v8.2d}, [x10] // .Lk_rcon @@ -618,7 +634,8 @@ _vpaes_schedule_core: bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 - adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10 + adrp x10, .Lk_sr + add x10, x10, #:lo12:.Lk_sr add x8, x8, x10 cbnz w3, .Lschedule_am_decrypting @@ -744,12 +761,14 @@ _vpaes_schedule_core: .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 - adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + adrp x11, .Lk_deskew + add x11, x11, #:lo12:.Lk_deskew cbnz w3, .Lschedule_mangle_last_dec // encrypting ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 - adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + adrp x11, .Lk_opt + add x11, x11, #:lo12:.Lk_opt add x2, x2, #32 // add $32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute diff --git a/sys/crypto/openssl/aarch64/vpsm4-armv8.S b/sys/crypto/openssl/aarch64/vpsm4-armv8.S new file mode 100644 index 000000000000..830e0315a2be --- /dev/null +++ b/sys/crypto/openssl/aarch64/vpsm4-armv8.S @@ -0,0 +1,5021 @@ +/* Do not modify. This file is auto-generated from vpsm4-armv8.pl. */ +// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You may not use +// this file except in compliance with the License. 
You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// +// This module implements SM4 with ASIMD on aarch64 +// +// Feb 2022 +// + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" +.arch armv8-a +.text + +.section .rodata +.type _vpsm4_consts,%object +.align 7 +_vpsm4_consts: +.Lsbox: +.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05 +.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99 +.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62 +.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6 +.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8 +.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35 +.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87 +.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E +.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1 +.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3 +.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F +.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51 +.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8 +.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0 +.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84 +.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48 +.Lck: +.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 +.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 +.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 +.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 +.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 +.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 +.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 +.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: +.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197 +.Lshuffles: +.quad 0x0B0A090807060504,0x030201000F0E0D0C +.Lxts_magic: +.quad 0x0101010101010187,0x0101010101010101 + +.size _vpsm4_consts,.-_vpsm4_consts + +.previous + +.type _vpsm4_set_key,%function +.align 4 +_vpsm4_set_key: + AARCH64_VALID_CALL_TARGET + ld1 {v5.4s},[x0] + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + adrp x5,.Lshuffles + add x5,x5,#:lo12:.Lshuffles + ld1 {v7.2d},[x5] + adrp x5,.Lfk + add x5,x5,#:lo12:.Lfk + ld1 {v6.2d},[x5] + eor v5.16b,v5.16b,v6.16b + mov x6,#32 + adrp x5,.Lck + add x5,x5,#:lo12:.Lck + movi v0.16b,#64 + cbnz w2,1f + add x1,x1,124 +1: + mov w7,v5.s[1] + ldr w8,[x5],#4 + eor w8,w8,w7 + mov w7,v5.s[2] + eor w8,w8,w7 + mov w7,v5.s[3] + eor w8,w8,w7 + // sbox lookup + mov v4.s[0],w8 + tbl v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b + sub v4.16b,v4.16b,v0.16b + tbx v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b + sub v4.16b,v4.16b,v0.16b + tbx 
v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b + sub v4.16b,v4.16b,v0.16b + tbx v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b + mov w7,v1.s[0] + eor w8,w7,w7,ror #19 + eor w8,w8,w7,ror #9 + mov w7,v5.s[0] + eor w8,w8,w7 + mov v5.s[0],w8 + cbz w2,2f + str w8,[x1],#4 + b 3f +2: + str w8,[x1],#-4 +3: + tbl v5.16b,{v5.16b},v7.16b + subs x6,x6,#1 + b.ne 1b + ret +.size _vpsm4_set_key,.-_vpsm4_set_key +.type _vpsm4_enc_4blks,%function +.align 4 +_vpsm4_enc_4blks: + AARCH64_VALID_CALL_TARGET + mov x10,x3 + mov w11,#8 +10: + ldp w7,w8,[x10],8 + dup v12.4s,w7 + dup v13.4s,w8 + + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor v14.16b,v6.16b,v7.16b + eor v12.16b,v5.16b,v12.16b + eor v12.16b,v14.16b,v12.16b + movi v0.16b,#64 + movi v1.16b,#128 + movi v2.16b,#192 + sub v0.16b,v12.16b,v0.16b + sub v1.16b,v12.16b,v1.16b + sub v2.16b,v12.16b,v2.16b + tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v0.2d,v0.2d,v1.2d + add v2.2d,v2.2d,v12.2d + add v12.2d,v0.2d,v2.2d + + ushr v0.4s,v12.4s,32-2 + sli v0.4s,v12.4s,2 + ushr v2.4s,v12.4s,32-10 + eor v1.16b,v0.16b,v12.16b + sli v2.4s,v12.4s,10 + eor v1.16b,v2.16b,v1.16b + ushr v0.4s,v12.4s,32-18 + sli v0.4s,v12.4s,18 + ushr v2.4s,v12.4s,32-24 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v12.4s,24 + eor v12.16b,v2.16b,v1.16b + eor v4.16b,v4.16b,v12.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor v14.16b,v14.16b,v4.16b + eor v13.16b,v14.16b,v13.16b + movi v0.16b,#64 + movi v1.16b,#128 + movi v2.16b,#192 + sub v0.16b,v13.16b,v0.16b + sub v1.16b,v13.16b,v1.16b + sub v2.16b,v13.16b,v2.16b + tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v0.2d,v0.2d,v1.2d + add v2.2d,v2.2d,v13.2d + add v13.2d,v0.2d,v2.2d + + ushr v0.4s,v13.4s,32-2 + sli v0.4s,v13.4s,2 + ushr v2.4s,v13.4s,32-10 + eor v1.16b,v0.16b,v13.16b + sli v2.4s,v13.4s,10 + eor v1.16b,v2.16b,v1.16b + ushr v0.4s,v13.4s,32-18 + sli v0.4s,v13.4s,18 + ushr v2.4s,v13.4s,32-24 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,24 + eor v13.16b,v2.16b,v1.16b + ldp w7,w8,[x10],8 + eor v5.16b,v5.16b,v13.16b + + dup v12.4s,w7 + dup v13.4s,w8 + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor v14.16b,v4.16b,v5.16b + eor v12.16b,v7.16b,v12.16b + eor v12.16b,v14.16b,v12.16b + movi v0.16b,#64 + movi v1.16b,#128 + movi v2.16b,#192 + sub v0.16b,v12.16b,v0.16b + sub v1.16b,v12.16b,v1.16b + sub v2.16b,v12.16b,v2.16b + tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v0.2d,v0.2d,v1.2d + add v2.2d,v2.2d,v12.2d + add v12.2d,v0.2d,v2.2d + + ushr v0.4s,v12.4s,32-2 + sli v0.4s,v12.4s,2 + ushr v2.4s,v12.4s,32-10 + eor v1.16b,v0.16b,v12.16b + sli v2.4s,v12.4s,10 + eor v1.16b,v2.16b,v1.16b + ushr v0.4s,v12.4s,32-18 + sli v0.4s,v12.4s,18 + ushr v2.4s,v12.4s,32-24 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v12.4s,24 + eor v12.16b,v2.16b,v1.16b + eor v6.16b,v6.16b,v12.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor v14.16b,v14.16b,v6.16b + eor v13.16b,v14.16b,v13.16b + movi v0.16b,#64 + movi v1.16b,#128 + movi v2.16b,#192 + sub v0.16b,v13.16b,v0.16b + sub v1.16b,v13.16b,v1.16b + sub v2.16b,v13.16b,v2.16b + tbl 
v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v0.2d,v0.2d,v1.2d + add v2.2d,v2.2d,v13.2d + add v13.2d,v0.2d,v2.2d + + ushr v0.4s,v13.4s,32-2 + sli v0.4s,v13.4s,2 + ushr v2.4s,v13.4s,32-10 + eor v1.16b,v0.16b,v13.16b + sli v2.4s,v13.4s,10 + eor v1.16b,v2.16b,v1.16b + ushr v0.4s,v13.4s,32-18 + sli v0.4s,v13.4s,18 + ushr v2.4s,v13.4s,32-24 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,24 + eor v13.16b,v2.16b,v1.16b + eor v7.16b,v7.16b,v13.16b + subs w11,w11,#1 + b.ne 10b +#ifndef __AARCH64EB__ + rev32 v3.16b,v4.16b +#else + mov v3.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v2.16b,v5.16b +#else + mov v2.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v1.16b,v6.16b +#else + mov v1.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v0.16b,v7.16b +#else + mov v0.16b,v7.16b +#endif + ret +.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks +.type _vpsm4_enc_8blks,%function +.align 4 +_vpsm4_enc_8blks: + AARCH64_VALID_CALL_TARGET + mov x10,x3 + mov w11,#8 +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + dup v12.4s,w7 + eor v14.16b,v6.16b,v7.16b + eor v15.16b,v10.16b,v11.16b + eor v0.16b,v5.16b,v12.16b + eor v1.16b,v9.16b,v12.16b + eor v12.16b,v14.16b,v0.16b + eor v13.16b,v15.16b,v1.16b + movi v3.16b,#64 + sub v0.16b,v12.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v12.2d,v2.2d,v12.2d + add v12.2d,v1.2d,v12.2d + + sub v0.16b,v13.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v13.2d,v2.2d,v13.2d + add v13.2d,v1.2d,v13.2d + + ushr v0.4s,v12.4s,32-2 + sli v0.4s,v12.4s,2 + ushr v2.4s,v13.4s,32-2 + eor v1.16b,v0.16b,v12.16b + sli v2.4s,v13.4s,2 + + ushr v0.4s,v12.4s,32-10 + eor v3.16b,v2.16b,v13.16b + sli v0.4s,v12.4s,10 + ushr v2.4s,v13.4s,32-10 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,10 + + ushr v0.4s,v12.4s,32-18 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,18 + ushr v2.4s,v13.4s,32-18 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,18 + + ushr v0.4s,v12.4s,32-24 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,24 + ushr v2.4s,v13.4s,32-24 + eor v12.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,24 + eor v13.16b,v2.16b,v3.16b + eor v4.16b,v4.16b,v12.16b + eor v8.16b,v8.16b,v13.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + dup v13.4s,w8 + eor v14.16b,v14.16b,v4.16b + eor v15.16b,v15.16b,v8.16b + eor v12.16b,v14.16b,v13.16b + eor v13.16b,v15.16b,v13.16b + movi v3.16b,#64 + sub v0.16b,v12.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v12.2d,v2.2d,v12.2d + add v12.2d,v1.2d,v12.2d + + sub v0.16b,v13.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b + tbl 
v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v13.2d,v2.2d,v13.2d + add v13.2d,v1.2d,v13.2d + + ushr v0.4s,v12.4s,32-2 + sli v0.4s,v12.4s,2 + ushr v2.4s,v13.4s,32-2 + eor v1.16b,v0.16b,v12.16b + sli v2.4s,v13.4s,2 + + ushr v0.4s,v12.4s,32-10 + eor v3.16b,v2.16b,v13.16b + sli v0.4s,v12.4s,10 + ushr v2.4s,v13.4s,32-10 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,10 + + ushr v0.4s,v12.4s,32-18 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,18 + ushr v2.4s,v13.4s,32-18 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,18 + + ushr v0.4s,v12.4s,32-24 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,24 + ushr v2.4s,v13.4s,32-24 + eor v12.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,24 + eor v13.16b,v2.16b,v3.16b + ldp w7,w8,[x10],8 + eor v5.16b,v5.16b,v12.16b + eor v9.16b,v9.16b,v13.16b + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + dup v12.4s,w7 + eor v14.16b,v4.16b,v5.16b + eor v15.16b,v8.16b,v9.16b + eor v0.16b,v7.16b,v12.16b + eor v1.16b,v11.16b,v12.16b + eor v12.16b,v14.16b,v0.16b + eor v13.16b,v15.16b,v1.16b + movi v3.16b,#64 + sub v0.16b,v12.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v12.2d,v2.2d,v12.2d + add v12.2d,v1.2d,v12.2d + + sub v0.16b,v13.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v13.2d,v2.2d,v13.2d + add v13.2d,v1.2d,v13.2d + + ushr v0.4s,v12.4s,32-2 + sli v0.4s,v12.4s,2 + ushr v2.4s,v13.4s,32-2 + eor v1.16b,v0.16b,v12.16b + sli v2.4s,v13.4s,2 + + ushr v0.4s,v12.4s,32-10 + eor v3.16b,v2.16b,v13.16b + sli v0.4s,v12.4s,10 + ushr v2.4s,v13.4s,32-10 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,10 + + ushr v0.4s,v12.4s,32-18 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,18 + ushr v2.4s,v13.4s,32-18 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,18 + + ushr v0.4s,v12.4s,32-24 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,24 + ushr v2.4s,v13.4s,32-24 + eor v12.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,24 + eor v13.16b,v2.16b,v3.16b + eor v6.16b,v6.16b,v12.16b + eor v10.16b,v10.16b,v13.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + dup v13.4s,w8 + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v10.16b + eor v12.16b,v14.16b,v13.16b + eor v13.16b,v15.16b,v13.16b + movi v3.16b,#64 + sub v0.16b,v12.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v12.2d,v2.2d,v12.2d + add v12.2d,v1.2d,v12.2d + + sub v0.16b,v13.16b,v3.16b + sub v1.16b,v0.16b,v3.16b + sub v2.16b,v1.16b,v3.16b + tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b + tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b + tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b + add v1.2d,v0.2d,v1.2d + add v13.2d,v2.2d,v13.2d + add v13.2d,v1.2d,v13.2d + + ushr v0.4s,v12.4s,32-2 + 
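// [Editorial note: explanatory sketch, not part of the auto-generated file.]
// The tbl ladder above splits the 256-byte SM4 S-box across four 64-byte
// lookups (v16-v19, v20-v23, v24-v27, v28-v31); TBL yields zero for
// out-of-range indices, so for every byte exactly one of the four partial
// results is non-zero and the following adds merely merge them.  The
// ushr/sli pairs (one such pair straddles this note) build per-lane 32-bit
// rotates: ushr vd,vs,#(32-n) followed by sli vd,vs,#n leaves vd = vs <<< n.
// A C reference of the step being assembled here, with illustrative names:
//
//   static inline uint32_t rotl32(uint32_t x, int n)
//   { return (x << n) | (x >> (32 - n)); }
//
//   /* s is the word after the byte-wise S-box; L is SM4's linear transform */
//   static inline uint32_t sm4_L(uint32_t s)
//   { return s ^ rotl32(s, 2) ^ rotl32(s, 10) ^ rotl32(s, 18) ^ rotl32(s, 24); }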
sli v0.4s,v12.4s,2 + ushr v2.4s,v13.4s,32-2 + eor v1.16b,v0.16b,v12.16b + sli v2.4s,v13.4s,2 + + ushr v0.4s,v12.4s,32-10 + eor v3.16b,v2.16b,v13.16b + sli v0.4s,v12.4s,10 + ushr v2.4s,v13.4s,32-10 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,10 + + ushr v0.4s,v12.4s,32-18 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,18 + ushr v2.4s,v13.4s,32-18 + eor v1.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,18 + + ushr v0.4s,v12.4s,32-24 + eor v3.16b,v2.16b,v3.16b + sli v0.4s,v12.4s,24 + ushr v2.4s,v13.4s,32-24 + eor v12.16b,v0.16b,v1.16b + sli v2.4s,v13.4s,24 + eor v13.16b,v2.16b,v3.16b + eor v7.16b,v7.16b,v12.16b + eor v11.16b,v11.16b,v13.16b + subs w11,w11,#1 + b.ne 10b +#ifndef __AARCH64EB__ + rev32 v3.16b,v4.16b +#else + mov v3.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v2.16b,v5.16b +#else + mov v2.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v1.16b,v6.16b +#else + mov v1.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v0.16b,v7.16b +#else + mov v0.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v8.16b +#else + mov v7.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v9.16b +#else + mov v6.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v10.16b +#else + mov v5.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v4.16b,v11.16b +#else + mov v4.16b,v11.16b +#endif + ret +.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks +.globl vpsm4_set_encrypt_key +.type vpsm4_set_encrypt_key,%function +.align 5 +vpsm4_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,1 + bl _vpsm4_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key +.globl vpsm4_set_decrypt_key +.type vpsm4_set_decrypt_key,%function +.align 5 +vpsm4_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
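// [Editorial note: sketch, not part of the auto-generated file.]
// Both key-setup entry points funnel into _vpsm4_set_key; w2 only selects the
// storage order of the 32 round keys.  With w2 = 1 (encrypt) they are written
// forward; with w2 = 0 (decrypt, as set just below) the output pointer is first
// advanced by 124 bytes and the keys are stored backwards, so decryption can
// reuse the same round loop over a reversed schedule.  The ror #19 / ror #9
// pair inside _vpsm4_set_key is the key-schedule linear transform
// L'(s) = s ^ (s <<< 13) ^ (s <<< 23), since ror by 32-n equals rotl by n.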
+ mov w2,0 + bl _vpsm4_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key +.globl vpsm4_encrypt +.type vpsm4_encrypt,%function +.align 5 +vpsm4_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v4.4s},[x0] + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x3,x2 + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov 
v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + st1 {v4.4s},[x1] + ret +.size vpsm4_encrypt,.-vpsm4_encrypt +.globl vpsm4_decrypt +.type vpsm4_decrypt,%function +.align 5 +vpsm4_decrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v4.4s},[x0] + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x3,x2 + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov 
v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + st1 {v4.4s},[x1] + ret +.size vpsm4_decrypt,.-vpsm4_decrypt +.globl vpsm4_ecb_encrypt +.type vpsm4_ecb_encrypt,%function +.align 5 +vpsm4_ecb_encrypt: + AARCH64_SIGN_LINK_REGISTER + // convert length into blocks + lsr x2,x2,4 + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] +.Lecb_8_blocks_process: + cmp w2,#8 + b.lt .Lecb_4_blocks_process + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + bl _vpsm4_enc_8blks + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp w2,#4 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_enc_4blks + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub w2,w2,#4 +1: + // process last block + cmp w2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor 
w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + st1 {v4.4s},[x1] + b 100f +1: // process last 2 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16 + ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16 + cmp w2,#2 + b.gt 1f +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_enc_4blks + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1] + b 100f +1: // process last 3 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_enc_4blks + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt +.globl vpsm4_cbc_encrypt +.type vpsm4_cbc_encrypt,%function +.align 5 +vpsm4_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + lsr x2,x2,4 + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] + cbz w5,.Ldec + ld1 {v3.4s},[x4] +.Lcbc_4_blocks_enc: + cmp w2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b,v4.16b,v3.16b +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi 
v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 + eor v5.16b,v5.16b,v4.16b + mov x10,x3 + mov w11,#8 + mov w12,v5.s[0] + mov w13,v5.s[1] + mov w14,v5.s[2] + mov w15,v5.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor 
w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v5.s[0],w15 + mov v5.s[1],w14 + mov v5.s[2],w13 + mov v5.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v6.16b,v6.16b,v5.16b + mov x10,x3 + mov w11,#8 + mov w12,v6.s[0] + mov w13,v6.s[1] + mov w14,v6.s[2] + mov w15,v6.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl 
v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v6.s[0],w15 + mov v6.s[1],w14 + mov v6.s[2],w13 + mov v6.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + eor v7.16b,v7.16b,v6.16b + mov x10,x3 + mov w11,#8 + mov w12,v7.s[0] + mov w13,v7.s[1] + mov w14,v7.s[2] + mov w15,v7.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi 
v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v7.s[0],w15 + mov v7.s[1],w14 + mov v7.s[2],w13 + mov v7.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + orr v3.16b,v7.16b,v7.16b + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#4 + b.ne .Lcbc_4_blocks_enc + b 2f +1: + subs w2,w2,#1 + b.lt 2f + ld1 {v4.4s},[x0],#16 + eor v3.16b,v3.16b,v4.16b +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl 
v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + st1 {v3.4s},[x1],#16 + b 1b +2: + // save back IV + st1 {v3.4s},[x4] + ret + +.Ldec: + // decryption mode starts + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +.Lcbc_8_blocks_dec: + cmp w2,#8 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] + add x10,x0,#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + bl _vpsm4_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + ld1 {v15.4s},[x4] + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + // note ivec1 and vtmpx[3] are reusing the same register + // care needs to be taken to avoid conflict + eor v0.16b,v0.16b,v15.16b + ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + eor v1.16b,v1.16b,v8.16b + eor v2.16b,v2.16b,v9.16b + eor v3.16b,v3.16b,v10.16b + // save back IV + st1 {v15.4s}, [x4] + eor v4.16b,v4.16b,v11.16b + eor v5.16b,v5.16b,v12.16b + eor v6.16b,v6.16b,v13.16b + eor v7.16b,v7.16b,v14.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.gt .Lcbc_8_blocks_dec + b.eq 100f +1: + ld1 {v15.4s},[x4] +.Lcbc_4_blocks_dec: + cmp w2,#4 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_enc_4blks + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + zip1 
v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + orr v15.16b,v7.16b,v7.16b + eor v2.16b,v2.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + subs w2,w2,#4 + b.gt .Lcbc_4_blocks_dec + // save back IV + st1 {v7.4s}, [x4] + b 100f +1: // last block + subs w2,w2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + // save back IV + st1 {v4.4s}, [x4] +#ifndef __AARCH64EB__ + rev32 v8.16b,v4.16b +#else + mov v8.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v8.s[0] + mov w13,v8.s[1] + mov w14,v8.s[2] + mov w15,v8.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor 
w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v8.s[0],w15 + mov v8.s[1],w14 + mov v8.s[2],w13 + mov v8.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + eor v8.16b,v8.16b,v15.16b + st1 {v8.4s},[x1],#16 + b 100f +1: // last two blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0] + add x10,x0,#16 + ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16 + subs w2,w2,1 + b.gt 1f +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_enc_4blks + ld1 {v4.4s,v5.4s},[x0],#32 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save back IV + st1 {v5.4s}, [x4] + b 100f +1: // last 3 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_enc_4blks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + eor v2.16b,v2.16b,v5.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save back IV + st1 {v6.4s}, [x4] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt +.globl vpsm4_ctr32_encrypt_blocks +.type vpsm4_ctr32_encrypt_blocks,%function +.align 5 +vpsm4_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + ld1 {v3.4s},[x4] +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] + cmp w2,#1 + b.ne 1f + // fast processing for one single block without + // context saving overhead + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl 
v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + ld1 {v4.4s},[x0] + eor v4.16b,v4.16b,v3.16b + st1 {v4.4s},[x1] + ret +1: + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! 
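// [Editorial note: sketch, not part of the auto-generated file.]
// The single-block path above runs the scalar round loop in place, as the
// source comment says, to avoid the register spill done here for a one-block
// call.  For the bulk path the counter is unpacked into w12/w13/w14/w5 and
// only the final counter word (w5) is incremented per block, i.e. the usual
// CTR32 behaviour: roughly, per block, ctr_word3 += 1; out = in ^ SM4(ctr).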
+ stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w5,v3.s[3] +.Lctr32_4_blocks_process: + cmp w2,#4 + b.lt 1f + dup v4.4s,w12 + dup v5.4s,w13 + dup v6.4s,w14 + mov v7.s[0],w5 + add w5,w5,#1 + mov v7.s[1],w5 + add w5,w5,#1 + mov v7.s[2],w5 + add w5,w5,#1 + mov v7.s[3],w5 + add w5,w5,#1 + cmp w2,#8 + b.ge .Lctr32_8_blocks_process + bl _vpsm4_enc_4blks + ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + subs w2,w2,#4 + b.ne .Lctr32_4_blocks_process + b 100f +.Lctr32_8_blocks_process: + dup v8.4s,w12 + dup v9.4s,w13 + dup v10.4s,w14 + mov v11.s[0],w5 + add w5,w5,#1 + mov v11.s[1],w5 + add w5,w5,#1 + mov v11.s[2],w5 + add w5,w5,#1 + mov v11.s[3],w5 + add w5,w5,#1 + bl _vpsm4_enc_8blks + ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + eor v4.16b,v4.16b,v8.16b + eor v5.16b,v5.16b,v9.16b + eor v6.16b,v6.16b,v10.16b + eor v7.16b,v7.16b,v11.16b + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.ne .Lctr32_4_blocks_process + b 100f +1: // last block processing + subs w2,w2,#1 + b.lt 100f + b.gt 1f + mov v3.s[0],w12 + mov v3.s[1],w13 + mov v3.s[2],w14 + mov v3.s[3],w5 + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + 
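// [Editorial note: sketch, not part of the auto-generated file.]
// This scalar path does the S-box through four single-lane TBL lookups; as in
// the vector kernels, out-of-range indices return zero, so the add sequence
// below just merges the one non-zero partial result per byte.  The
// eor ...,ror #32-n chain that follows is the same linear transform
// L(s) = s ^ (s <<< 2) ^ (s <<< 10) ^ (s <<< 18) ^ (s <<< 24) carried out in
// general-purpose registers (ror by 32-n == rotl by n).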
mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + ld1 {v4.4s},[x0] + eor v4.16b,v4.16b,v3.16b + st1 {v4.4s},[x1] + b 100f +1: // last 2 blocks processing + dup v4.4s,w12 + dup v5.4s,w13 + dup v6.4s,w14 + mov v7.s[0],w5 + add w5,w5,#1 + mov v7.s[1],w5 + subs w2,w2,#1 + b.ne 1f + bl _vpsm4_enc_4blks + ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + b 100f +1: // last 3 blocks processing + add w5,w5,#1 + mov v7.s[2],w5 + bl _vpsm4_enc_4blks + ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16 +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks +.globl vpsm4_xts_encrypt_gb +.type vpsm4_xts_encrypt_gb,%function +.align 5 +vpsm4_xts_encrypt_gb: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
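// [Editorial note: sketch, not part of the auto-generated file.]
// Argument shuffling for the XTS "GB" entry point: x3 (the data key) is parked
// in x26, x4 (used below to encrypt the IV, i.e. the tweak key) in x27, and the
// encrypt/decrypt flag w6 in w28.  The IV is loaded from x5 into v8 and, with
// x3 temporarily pointed at the tweak key schedule, run through the scalar
// round loop that follows; the result is the initial XTS tweak, after which x3
// is restored to the data key for the block processing.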
+ mov x26,x3 + mov x27,x4 + mov w28,w6 + ld1 {v8.4s}, [x5] + mov x3,x27 + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v8.s[0] + mov w13,v8.s[1] + mov w14,v8.s[2] + mov w15,v8.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v8.s[0],w15 + mov v8.s[1],w14 + mov v8.s[2],w13 + mov v8.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov x3,x26 + and x29,x2,#0x0F + // convert length into blocks + lsr x2,x2,4 + cmp x2,#1 + b.lt .return_gb + 
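// [Editorial note: sketch, not part of the auto-generated file.]
// x29 holds len & 0x0F.  A non-zero remainder routes the last two blocks to the
// ciphertext-stealing paths named below (.last_2blks_tweak_gb /
// .only_2blks_tweak_gb); a zero remainder keeps everything in
// .xts_encrypt_blocks_gb.  The tweak chain that follows is a GF(2^128)
// doubling with the "0x87" reduction constant, applied to a bit-reversed
// (rbit) tweak in this GB flavour.  One doubling of a tweak held as two 64-bit
// halves looks roughly like this in C (illustrative only):
//
//   void xts_double(uint64_t t[2])            /* t[0] = low half, t[1] = high */
//   {
//       uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
//       t[1] = (t[1] << 1) | (t[0] >> 63);    /* shift the 128-bit value left */
//       t[0] = (t[0] << 1) ^ carry;           /* fold the reduction back in */
//   }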
+ cmp x29,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb + b.eq .xts_encrypt_blocks_gb + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb + subs x2,x2,#1 + b.eq .only_2blks_tweak_gb +.xts_encrypt_blocks_gb: + rbit v8.16b,v8.16b +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov x12,v8.d[0] + mov x13,v8.d[1] + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 +.Lxts_8_blocks_process_gb: + cmp x2,#8 + b.lt .Lxts_4_blocks_process_gb + mov v0.d[0],x12 + mov v0.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v0.16b,v0.16b +#endif + mov v1.d[0],x14 + mov v1.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v1.16b,v1.16b +#endif + mov v2.d[0],x16 + mov v2.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v2.16b,v2.16b +#endif + mov v3.d[0],x18 + mov v3.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + mov v12.d[0],x20 + mov v12.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v12.16b,v12.16b +#endif + mov v13.d[0],x22 + mov v13.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v13.16b,v13.16b +#endif + mov v14.d[0],x24 + mov v14.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v14.16b,v14.16b +#endif + mov v15.d[0],x26 + mov v15.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v15.16b,v15.16b +#endif + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + rbit v0.16b,v0.16b + rbit v1.16b,v1.16b + rbit v2.16b,v2.16b + rbit v3.16b,v3.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + rbit v12.16b,v12.16b + rbit v13.16b,v13.16b + rbit v14.16b,v14.16b + rbit v15.16b,v15.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + zip1 v0.4s,v8.4s,v9.4s + zip2 v1.4s,v8.4s,v9.4s + zip1 v2.4s,v10.4s,v11.4s + zip2 v3.4s,v10.4s,v11.4s + zip1 v8.2d,v0.2d,v2.2d + zip2 v9.2d,v0.2d,v2.2d + zip1 v10.2d,v1.2d,v3.2d + zip2 v11.2d,v1.2d,v3.2d + bl _vpsm4_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 
v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + mov v12.d[0],x12 + mov v12.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v12.16b,v12.16b +#endif + mov w7,0x87 + extr x9,x27,x27,#32 + extr x13,x27,x26,#63 + and w8,w7,w9,asr#31 + eor x12,x8,x26,lsl#1 + mov v13.d[0],x14 + mov v13.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v13.16b,v13.16b +#endif + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov v14.d[0],x16 + mov v14.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v14.16b,v14.16b +#endif + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov v15.d[0],x18 + mov v15.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v15.16b,v15.16b +#endif + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov v8.d[0],x20 + mov v8.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov v9.d[0],x22 + mov v9.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov v10.d[0],x24 + mov v10.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov v11.d[0],x26 + mov v11.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + + // save the last tweak + st1 {v11.4s},[x5] + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs x2,x2,#8 + b.gt .Lxts_8_blocks_process_gb + b 100f +.Lxts_4_blocks_process_gb: + mov v8.d[0],x12 + mov v8.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov v9.d[0],x14 + mov v9.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + mov v10.d[0],x16 + mov v10.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + mov v11.d[0],x18 + mov v11.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + cmp x2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + rbit v8.16b,v8.16b + rbit v9.16b,v9.16b + rbit v10.16b,v10.16b + rbit v11.16b,v11.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d 
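+	// the remaining zip1/zip2 pair completes the un-transpose of the
+	// _vpsm4_enc_4blks results, so v0-v3 again hold whole blocks ready
+	// to be xored with the tweaks in v8-v11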
+ zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub x2,x2,#4 + mov v8.d[0],x20 + mov v8.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov v9.d[0],x22 + mov v9.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + mov v10.d[0],x24 + mov v10.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + // save the last tweak + st1 {v11.4s},[x5] +1: + // process last block + cmp x2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + rbit v8.16b,v8.16b + eor v4.16b, v4.16b, v8.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + 
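+	// the three rotations below complete the SM4 linear transform
+	// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)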
eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v8.16b + st1 {v4.4s},[x1],#16 + // save the last tweak + st1 {v8.4s},[x5] + b 100f +1: // process last 2 blocks + cmp x2,#2 + b.gt 1f + ld1 {v4.4s,v5.4s},[x0],#32 + rbit v8.16b,v8.16b + rbit v9.16b,v9.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save the last tweak + st1 {v9.4s},[x5] + b 100f +1: // process last 3 blocks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + rbit v8.16b,v8.16b + rbit v9.16b,v9.16b + rbit v10.16b,v10.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save the last tweak + st1 {v10.4s},[x5] +100: + cmp x29,0 + b.eq .return_gb + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak_gb: + ld1 {v8.4s},[x5] +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + rbit v2.16b,v8.16b + adrp x10,.Lxts_magic + ldr q0, [x10, #:lo12:.Lxts_magic] + shl v9.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + rbit v9.16b,v9.16b + rbit v2.16b,v9.16b + adrp x10,.Lxts_magic + ldr q0, [x10, #:lo12:.Lxts_magic] + shl v10.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v10.16b, v10.16b, v1.16b + rbit v10.16b,v10.16b + b .check_dec_gb + + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak_gb: + mov v9.16b,v8.16b +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + rbit v2.16b,v9.16b + adrp x10,.Lxts_magic + ldr q0, [x10, #:lo12:.Lxts_magic] + shl v10.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v10.16b, v10.16b, v1.16b + rbit v10.16b,v10.16b + b .check_dec_gb + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. 
+.check_dec_gb: + // encryption:1 decryption:0 + cmp w28,1 + b.eq .process_last_2blks_gb + mov v0.16B,v9.16b + mov v9.16B,v10.16b + mov v10.16B,v0.16b + +.process_last_2blks_gb: +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v9.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v9.16b + st1 {v4.4s},[x1],#16 + + sub x26,x1,16 +.loop_gb: + subs 
x29,x29,1 + ldrb w7,[x26,x29] + ldrb w8,[x0,x29] + strb w8,[x26,x29] + strb w7,[x1,x29] + b.gt .loop_gb + ld1 {v4.4s}, [x26] + eor v4.16b, v4.16b, v10.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v10.16b + st1 {v4.4s}, [x26] +.return_gb: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, 
x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb +.globl vpsm4_xts_encrypt +.type vpsm4_xts_encrypt,%function +.align 5 +vpsm4_xts_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! + mov x26,x3 + mov x27,x4 + mov w28,w6 + ld1 {v8.4s}, [x5] + mov x3,x27 + adrp x10,.Lsbox + add x10,x10,#:lo12:.Lsbox + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v8.s[0] + mov w13,v8.s[1] + mov w14,v8.s[2] + mov w15,v8.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl 
v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v8.s[0],w15 + mov v8.s[1],w14 + mov v8.s[2],w13 + mov v8.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov x3,x26 + and x29,x2,#0x0F + // convert length into blocks + lsr x2,x2,4 + cmp x2,#1 + b.lt .return + + cmp x29,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are encrypted/decrypted in .xts_encrypt_blocks + b.eq .xts_encrypt_blocks + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks + subs x2,x2,#1 + b.eq .only_2blks_tweak +.xts_encrypt_blocks: +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov x12,v8.d[0] + mov x13,v8.d[1] + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 +.Lxts_8_blocks_process: + cmp x2,#8 + b.lt .Lxts_4_blocks_process + mov v0.d[0],x12 + mov v0.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v0.16b,v0.16b +#endif + mov v1.d[0],x14 + mov v1.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v1.16b,v1.16b +#endif + mov v2.d[0],x16 + mov v2.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v2.16b,v2.16b +#endif + mov v3.d[0],x18 + mov v3.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + mov v12.d[0],x20 + mov v12.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v12.16b,v12.16b +#endif + mov v13.d[0],x22 + mov v13.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v13.16b,v13.16b +#endif + mov v14.d[0],x24 + mov v14.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v14.16b,v14.16b +#endif + mov v15.d[0],x26 + mov v15.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v15.16b,v15.16b +#endif + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 
v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + zip1 v0.4s,v8.4s,v9.4s + zip2 v1.4s,v8.4s,v9.4s + zip1 v2.4s,v10.4s,v11.4s + zip2 v3.4s,v10.4s,v11.4s + zip1 v8.2d,v0.2d,v2.2d + zip2 v9.2d,v0.2d,v2.2d + zip1 v10.2d,v1.2d,v3.2d + zip2 v11.2d,v1.2d,v3.2d + bl _vpsm4_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + mov v12.d[0],x12 + mov v12.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v12.16b,v12.16b +#endif + mov w7,0x87 + extr x9,x27,x27,#32 + extr x13,x27,x26,#63 + and w8,w7,w9,asr#31 + eor x12,x8,x26,lsl#1 + mov v13.d[0],x14 + mov v13.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v13.16b,v13.16b +#endif + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov v14.d[0],x16 + mov v14.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v14.16b,v14.16b +#endif + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov v15.d[0],x18 + mov v15.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v15.16b,v15.16b +#endif + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov v8.d[0],x20 + mov v8.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov v9.d[0],x22 + mov v9.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov v10.d[0],x24 + mov v10.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov v11.d[0],x26 + mov v11.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + + // save the last tweak + st1 {v11.4s},[x5] + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs x2,x2,#8 + b.gt .Lxts_8_blocks_process + b 100f +.Lxts_4_blocks_process: + mov v8.d[0],x12 + mov v8.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov v9.d[0],x14 + mov v9.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + mov v10.d[0],x16 + mov v10.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + mov v11.d[0],x18 + mov v11.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + cmp x2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 
v7.16b,v7.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub x2,x2,#4 + mov v8.d[0],x20 + mov v8.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov v9.d[0],x22 + mov v9.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + mov v10.d[0],x24 + mov v10.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + // save the last tweak + st1 {v11.4s},[x5] +1: + // process last block + cmp x2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v8.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub 
v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v8.16b + st1 {v4.4s},[x1],#16 + // save the last tweak + st1 {v8.4s},[x5] + b 100f +1: // process last 2 blocks + cmp x2,#2 + b.gt 1f + ld1 {v4.4s,v5.4s},[x0],#32 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save the last tweak + st1 {v9.4s},[x5] + b 100f +1: // process last 3 blocks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save the last tweak + st1 {v10.4s},[x5] +100: + cmp x29,0 + b.eq .return + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak: + ld1 {v8.4s},[x5] +#ifdef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + mov v2.16b,v8.16b + adrp x10,.Lxts_magic + ldr q0, [x10, #:lo12:.Lxts_magic] + shl v9.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + mov v2.16b,v9.16b + adrp x10,.Lxts_magic + ldr q0, [x10, #:lo12:.Lxts_magic] + shl v10.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v10.16b, v10.16b, v1.16b + b .check_dec + + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak: + mov v9.16b,v8.16b +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif + mov v2.16b,v9.16b + adrp x10,.Lxts_magic + ldr q0, [x10, #:lo12:.Lxts_magic] + shl v10.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, 
v0.16b + eor v10.16b, v10.16b, v1.16b + b .check_dec + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. +.check_dec: + // encryption:1 decryption:0 + cmp w28,1 + b.eq .process_last_2blks + mov v0.16B,v9.16b + mov v9.16B,v10.16b + mov v10.16B,v0.16b + +.process_last_2blks: +#ifdef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifdef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v9.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov 
v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v9.16b + st1 {v4.4s},[x1],#16 + + sub x26,x1,16 +.loop: + subs x29,x29,1 + ldrb w7,[x26,x29] + ldrb w8,[x0,x29] + strb w8,[x26,x29] + strb w7,[x1,x29] + b.gt .loop + ld1 {v4.4s}, [x26] + eor v4.16b, v4.16b, v10.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + movi v1.16b,#64 + movi v2.16b,#128 + movi v3.16b,#192 + mov v0.s[0],w6 + + sub v1.16b,v0.16b,v1.16b + sub v2.16b,v0.16b,v2.16b + sub v3.16b,v0.16b,v3.16b + + tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b + tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b + tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b + + mov w6,v0.s[0] + mov w7,v1.s[0] + mov w9,v2.s[0] + add w7,w6,w7 + mov w6,v3.s[0] + add w7,w7,w9 + add w7,w7,w6 + + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v10.16b + st1 {v4.4s}, [x26] +.return: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, 
[sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_xts_encrypt,.-vpsm4_xts_encrypt diff --git a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S new file mode 100644 index 000000000000..5627d6d1c6b4 --- /dev/null +++ b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S @@ -0,0 +1,4523 @@ +/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */ +// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// +// This module implements SM4 with ASIMD and AESE on AARCH64 +// +// Dec 2022 +// + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.type _vpsm4_ex_consts,%object +.align 7 +_vpsm4_ex_consts: +.Lck: +.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 +.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 +.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 +.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 +.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 +.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 +.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 +.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: +.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197 +.Lshuffles: +.quad 0x0B0A090807060504,0x030201000F0E0D0C +.Lxts_magic: +.quad 0x0101010101010187,0x0101010101010101 +.Lsbox_magic: +.quad 0x0b0e0104070a0d00,0x0306090c0f020508 +.quad 0x62185a2042387a00,0x22581a6002783a40 +.quad 0x15df62a89e54e923,0xc10bb67c4a803df7 +.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead +.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc +.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f + +.size _vpsm4_ex_consts,.-_vpsm4_ex_consts +.type _vpsm4_ex_set_key,%function +.align 4 +_vpsm4_ex_set_key: + AARCH64_VALID_CALL_TARGET + ld1 {v5.4s},[x0] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + adrp x5,.Lshuffles + add x5,x5,#:lo12:.Lshuffles + ld1 {v7.2d},[x5] + adrp x5,.Lfk + add x5,x5,#:lo12:.Lfk + ld1 {v6.2d},[x5] + eor v5.16b,v5.16b,v6.16b + mov x6,#32 + adrp x5,.Lck + add x5,x5,#:lo12:.Lck + movi v0.16b,#64 + cbnz w2,1f + add x1,x1,124 +1: + mov w7,v5.s[1] + ldr w8,[x5],#4 + eor w8,w8,w7 + mov w7,v5.s[2] + eor w8,w8,w7 + mov w7,v5.s[3] + eor w8,w8,w7 + // optimize sbox using AESE instruction + mov v4.s[0],w8 + tbl v0.16b, {v4.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + mov 
w7,v0.s[0] + eor w8,w7,w7,ror #19 + eor w8,w8,w7,ror #9 + mov w7,v5.s[0] + eor w8,w8,w7 + mov v5.s[0],w8 + cbz w2,2f + str w8,[x1],#4 + b 3f +2: + str w8,[x1],#-4 +3: + tbl v5.16b,{v5.16b},v7.16b + subs x6,x6,#1 + b.ne 1b + ret +.size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key +.type _vpsm4_ex_enc_4blks,%function +.align 4 +_vpsm4_ex_enc_4blks: + AARCH64_VALID_CALL_TARGET + mov x10,x3 + mov w11,#8 +10: + ldp w7,w8,[x10],8 + dup v12.4s,w7 + dup v13.4s,w8 + + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor v14.16b,v6.16b,v7.16b + eor v12.16b,v5.16b,v12.16b + eor v12.16b,v14.16b,v12.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v12.16b,v0.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + eor v4.16b,v4.16b,v12.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor v14.16b,v14.16b,v4.16b + eor v13.16b,v14.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v13.16b,v0.16b + + // linear transformation + ushr v0.4s,v13.4s,32-2 + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v0.4s,v13.4s,2 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v0.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + ldp w7,w8,[x10],8 + eor v5.16b,v5.16b,v13.16b + + dup v12.4s,w7 + dup v13.4s,w8 + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor v14.16b,v4.16b,v5.16b + eor v12.16b,v7.16b,v12.16b + eor v12.16b,v14.16b,v12.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v12.16b,v0.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + eor v6.16b,v6.16b,v12.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor v14.16b,v14.16b,v6.16b + eor v13.16b,v14.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + 
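+	// (the nibble-wise tbl lookups before and after the aese apply the
+	// affine maps that let the AES instruction's GF(2^8) inversion stand
+	// in for the SM4 S-box, as noted by the "optimize sbox" comments)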
tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + mov v13.16b,v0.16b + + // linear transformation + ushr v0.4s,v13.4s,32-2 + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v0.4s,v13.4s,2 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v0.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v7.16b,v7.16b,v13.16b + subs w11,w11,#1 + b.ne 10b +#ifndef __AARCH64EB__ + rev32 v3.16b,v4.16b +#else + mov v3.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v2.16b,v5.16b +#else + mov v2.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v1.16b,v6.16b +#else + mov v1.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v0.16b,v7.16b +#else + mov v0.16b,v7.16b +#endif + ret +.size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks +.type _vpsm4_ex_enc_8blks,%function +.align 4 +_vpsm4_ex_enc_8blks: + AARCH64_VALID_CALL_TARGET + mov x10,x3 + mov w11,#8 +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + dup v12.4s,w7 + eor v14.16b,v6.16b,v7.16b + eor v15.16b,v10.16b,v11.16b + eor v0.16b,v5.16b,v12.16b + eor v1.16b,v9.16b,v12.16b + eor v12.16b,v14.16b,v0.16b + eor v13.16b,v15.16b,v1.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v4.16b,v4.16b,v12.16b + eor v8.16b,v8.16b,v13.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + dup v13.4s,w8 + eor v14.16b,v14.16b,v4.16b + eor v15.16b,v15.16b,v8.16b + eor v12.16b,v14.16b,v13.16b + eor v13.16b,v15.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor 
v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + ldp w7,w8,[x10],8 + eor v5.16b,v5.16b,v12.16b + eor v9.16b,v9.16b,v13.16b + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + dup v12.4s,w7 + eor v14.16b,v4.16b,v5.16b + eor v15.16b,v8.16b,v9.16b + eor v0.16b,v7.16b,v12.16b + eor v1.16b,v11.16b,v12.16b + eor v12.16b,v14.16b,v0.16b + eor v13.16b,v15.16b,v1.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v6.16b,v6.16b,v12.16b + eor v10.16b,v10.16b,v13.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + dup v13.4s,w8 + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v10.16b + eor v12.16b,v14.16b,v13.16b + eor v13.16b,v15.16b,v13.16b + // optimize sbox using AESE instruction + tbl v0.16b, {v12.16b}, v26.16b + tbl v1.16b, {v13.16b}, v26.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v28.16b}, v1.16b + tbl v24.16b, {v27.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + aese v0.16b,v25.16b + aese 
v1.16b,v25.16b + ushr v24.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v0.16b, v0.16b, v24.16b + ushr v24.16b, v1.16b, 4 + and v1.16b, v1.16b, v31.16b + tbl v1.16b, {v30.16b}, v1.16b + tbl v24.16b, {v29.16b}, v24.16b + eor v1.16b, v1.16b, v24.16b + mov v12.16b,v0.16b + mov v13.16b,v1.16b + + // linear transformation + ushr v0.4s,v12.4s,32-2 + ushr v25.4s,v13.4s,32-2 + ushr v1.4s,v12.4s,32-10 + ushr v2.4s,v12.4s,32-18 + ushr v3.4s,v12.4s,32-24 + sli v0.4s,v12.4s,2 + sli v25.4s,v13.4s,2 + sli v1.4s,v12.4s,10 + sli v2.4s,v12.4s,18 + sli v3.4s,v12.4s,24 + eor v24.16b,v0.16b,v12.16b + eor v24.16b,v24.16b,v1.16b + eor v12.16b,v2.16b,v3.16b + eor v12.16b,v12.16b,v24.16b + ushr v1.4s,v13.4s,32-10 + ushr v2.4s,v13.4s,32-18 + ushr v3.4s,v13.4s,32-24 + sli v1.4s,v13.4s,10 + sli v2.4s,v13.4s,18 + sli v3.4s,v13.4s,24 + eor v24.16b,v25.16b,v13.16b + eor v24.16b,v24.16b,v1.16b + eor v13.16b,v2.16b,v3.16b + eor v13.16b,v13.16b,v24.16b + eor v7.16b,v7.16b,v12.16b + eor v11.16b,v11.16b,v13.16b + subs w11,w11,#1 + b.ne 10b +#ifndef __AARCH64EB__ + rev32 v3.16b,v4.16b +#else + mov v3.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v2.16b,v5.16b +#else + mov v2.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v1.16b,v6.16b +#else + mov v1.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v0.16b,v7.16b +#else + mov v0.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v8.16b +#else + mov v7.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v9.16b +#else + mov v6.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v10.16b +#else + mov v5.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v4.16b,v11.16b +#else + mov v4.16b,v11.16b +#endif + ret +.size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks +.globl vpsm4_ex_set_encrypt_key +.type vpsm4_ex_set_encrypt_key,%function +.align 5 +vpsm4_ex_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,1 + bl _vpsm4_ex_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key +.globl vpsm4_ex_set_decrypt_key +.type vpsm4_ex_set_decrypt_key,%function +.align 5 +vpsm4_ex_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
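	// Both key-setup wrappers call the shared _vpsm4_ex_set_key helper;
	// w2 selects the direction (1 in the encrypt wrapper above, 0 here).
	// For SM4 the decryption schedule is the encryption round keys used
	// in reverse order.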
+ mov w2,0 + bl _vpsm4_ex_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key +.globl vpsm4_ex_encrypt +.type vpsm4_ex_encrypt,%function +.align 5 +vpsm4_ex_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v4.4s},[x0] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x3,x2 + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + 
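	// v4 now holds (X35, X34, X33, X32): the reversed word order applies
	// SM4's final reverse transform R before the result is stored as the
	// output block.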
st1 {v4.4s},[x1] + ret +.size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt +.globl vpsm4_ex_decrypt +.type vpsm4_ex_decrypt,%function +.align 5 +vpsm4_ex_decrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v4.4s},[x0] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x3,x2 + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + st1 {v4.4s},[x1] + ret +.size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt +.globl 
vpsm4_ex_ecb_encrypt +.type vpsm4_ex_ecb_encrypt,%function +.align 5 +vpsm4_ex_ecb_encrypt: + AARCH64_SIGN_LINK_REGISTER + // convert length into blocks + lsr x2,x2,4 + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +.Lecb_8_blocks_process: + cmp w2,#8 + b.lt .Lecb_4_blocks_process + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + bl _vpsm4_ex_enc_8blks + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp w2,#4 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub w2,w2,#4 +1: + // process last block + cmp w2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, 
v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + st1 {v4.4s},[x1] + b 100f +1: // process last 2 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16 + ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16 + cmp w2,#2 + b.gt 1f +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1] + b 100f +1: // process last 3 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt +.globl vpsm4_ex_cbc_encrypt +.type vpsm4_ex_cbc_encrypt,%function +.align 5 +vpsm4_ex_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + lsr x2,x2,4 + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] + cbz w5,.Ldec + ld1 {v3.4s},[x4] +.Lcbc_4_blocks_enc: + cmp w2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b,v4.16b,v3.16b +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b 
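	// v1 is all-zero, so this AESE contributes only the AES ShiftRows and
	// SubBytes steps; the surrounding tbl lookups on the .Lsbox_magic
	// constants handle the byte permutation and nibble-wise affine maps
	// that translate between the SM4 and AES S-boxes, letting the AES
	// S-box hardware evaluate the SM4 S-box.  The rotate/eor sequence that
	// follows is SM4's linear transform
	// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).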
+ ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 + eor v5.16b,v5.16b,v4.16b + mov x10,x3 + mov w11,#8 + mov w12,v5.s[0] + mov w13,v5.s[1] + mov w14,v5.s[2] + mov w15,v5.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + 
aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v5.s[0],w15 + mov v5.s[1],w14 + mov v5.s[2],w13 + mov v5.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v6.16b,v6.16b,v5.16b + mov x10,x3 + mov w11,#8 + mov w12,v6.s[0] + mov w13,v6.s[1] + mov w14,v6.s[2] + mov w15,v6.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + 
tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v6.s[0],w15 + mov v6.s[1],w14 + mov v6.s[2],w13 + mov v6.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + eor v7.16b,v7.16b,v6.16b + mov x10,x3 + mov w11,#8 + mov w12,v7.s[0] + mov w13,v7.s[1] + mov w14,v7.s[2] + mov w15,v7.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, 
v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v7.s[0],w15 + mov v7.s[1],w14 + mov v7.s[2],w13 + mov v7.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + orr v3.16b,v7.16b,v7.16b + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#4 + b.ne .Lcbc_4_blocks_enc + b 2f +1: + subs w2,w2,#1 + b.lt 2f + ld1 {v4.4s},[x0],#16 + eor v3.16b,v3.16b,v4.16b +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor 
v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + st1 {v3.4s},[x1],#16 + b 1b +2: + // save back IV + st1 {v3.4s},[x4] + ret + +.Ldec: + // decryption mode starts + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +.Lcbc_8_blocks_dec: + cmp w2,#8 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] + add x10,x0,#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + bl _vpsm4_ex_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + ld1 {v15.4s},[x4] + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + // note ivec1 and vtmpx[3] are reusing the same register + // care needs to be taken to avoid conflict + eor v0.16b,v0.16b,v15.16b + ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + eor v1.16b,v1.16b,v8.16b + eor v2.16b,v2.16b,v9.16b + eor v3.16b,v3.16b,v10.16b + // save back IV + st1 {v15.4s}, [x4] + eor v4.16b,v4.16b,v11.16b + eor v5.16b,v5.16b,v12.16b + eor v6.16b,v6.16b,v13.16b + eor v7.16b,v7.16b,v14.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.gt .Lcbc_8_blocks_dec + b.eq 100f +1: + ld1 {v15.4s},[x4] +.Lcbc_4_blocks_dec: + cmp w2,#4 + b.lt 1f + ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + orr v15.16b,v7.16b,v7.16b + eor v2.16b,v2.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + subs w2,w2,#4 + b.gt .Lcbc_4_blocks_dec + // save back IV + st1 {v7.4s}, [x4] + b 100f +1: // last block + subs w2,w2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + // save back IV + st1 {v4.4s}, [x4] +#ifndef __AARCH64EB__ + rev32 v8.16b,v4.16b +#else + mov v8.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v8.s[0] + mov w13,v8.s[1] + mov w14,v8.s[2] + mov w15,v8.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, 
v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v8.s[0],w15 + mov v8.s[1],w14 + mov v8.s[2],w13 + mov v8.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif + eor v8.16b,v8.16b,v15.16b + st1 {v8.4s},[x1],#16 + b 100f +1: // last two blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0] + add x10,x0,#16 + ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16 + subs w2,w2,1 + b.gt 1f +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + ld1 {v4.4s,v5.4s},[x0],#32 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save back IV + st1 {v5.4s}, [x4] + b 100f +1: // last 3 blocks + ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10] +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef 
__AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + bl _vpsm4_ex_enc_4blks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + eor v0.16b,v0.16b,v15.16b + eor v1.16b,v1.16b,v4.16b + eor v2.16b,v2.16b,v5.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save back IV + st1 {v6.4s}, [x4] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt +.globl vpsm4_ex_ctr32_encrypt_blocks +.type vpsm4_ex_ctr32_encrypt_blocks,%function +.align 5 +vpsm4_ex_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + ld1 {v3.4s},[x4] +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] + cmp w2,#1 + b.ne 1f + // fast processing for one single block without + // context saving overhead + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor 
w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + ld1 {v4.4s},[x0] + eor v4.16b,v4.16b,v3.16b + st1 {v4.4s},[x1] + ret +1: + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w5,v3.s[3] +.Lctr32_4_blocks_process: + cmp w2,#4 + b.lt 1f + dup v4.4s,w12 + dup v5.4s,w13 + dup v6.4s,w14 + mov v7.s[0],w5 + add w5,w5,#1 + mov v7.s[1],w5 + add w5,w5,#1 + mov v7.s[2],w5 + add w5,w5,#1 + mov v7.s[3],w5 + add w5,w5,#1 + cmp w2,#8 + b.ge .Lctr32_8_blocks_process + bl _vpsm4_ex_enc_4blks + ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + subs w2,w2,#4 + b.ne .Lctr32_4_blocks_process + b 100f +.Lctr32_8_blocks_process: + dup v8.4s,w12 + dup v9.4s,w13 + dup v10.4s,w14 + mov v11.s[0],w5 + add w5,w5,#1 + mov v11.s[1],w5 + add w5,w5,#1 + mov v11.s[2],w5 + add w5,w5,#1 + mov v11.s[3],w5 + add w5,w5,#1 + bl _vpsm4_ex_enc_8blks + ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 + ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + eor v4.16b,v4.16b,v8.16b + eor v5.16b,v5.16b,v9.16b + eor v6.16b,v6.16b,v10.16b + eor v7.16b,v7.16b,v11.16b + st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs w2,w2,#8 + b.ne .Lctr32_4_blocks_process + b 100f +1: // last block processing + subs w2,w2,#1 + b.lt 100f + b.gt 1f + mov v3.s[0],w12 + mov v3.s[1],w13 + mov v3.s[2],w14 + mov v3.s[3],w5 + mov x10,x3 + mov w11,#8 + mov w12,v3.s[0] + mov w13,v3.s[1] + mov w14,v3.s[2] + mov w15,v3.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + 
aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v3.s[0],w15 + mov v3.s[1],w14 + mov v3.s[2],w13 + mov v3.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v3.16b,v3.16b +#endif + ld1 {v4.4s},[x0] + eor v4.16b,v4.16b,v3.16b + st1 {v4.4s},[x1] + b 100f +1: // last 2 blocks processing + dup v4.4s,w12 + dup v5.4s,w13 + dup v6.4s,w14 + mov v7.s[0],w5 + add w5,w5,#1 + mov v7.s[1],w5 + subs w2,w2,#1 + b.ne 1f + bl _vpsm4_ex_enc_4blks + ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + b 100f +1: // last 3 blocks processing + add w5,w5,#1 + mov v7.s[2],w5 + bl _vpsm4_ex_enc_4blks + ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 + ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16 + eor v0.16b,v0.16b,v12.16b + eor v1.16b,v1.16b,v13.16b + eor v2.16b,v2.16b,v14.16b + eor v3.16b,v3.16b,v15.16b + st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 + st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16 +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks +.globl vpsm4_ex_xts_encrypt_gb +.type vpsm4_ex_xts_encrypt_gb,%function +.align 5 +vpsm4_ex_xts_encrypt_gb: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
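	// The XTS code below keeps a chain of eight pre-computed tweaks in
	// x12-x27 (copied into v16-v23 each iteration), so a wide set of
	// general-purpose and SIMD registers is saved on entry.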
+ mov x26,x3 + mov x27,x4 + mov w28,w6 + ld1 {v16.4s}, [x5] + mov x3,x27 + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v16.s[0] + mov w13,v16.s[1] + mov w14,v16.s[2] + mov w15,v16.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v16.s[0],w15 + mov v16.s[1],w14 + mov v16.s[2],w13 + mov v16.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x3,x26 + and x29,x2,#0x0F + // convert length into blocks + lsr x2,x2,4 + cmp x2,#1 + b.lt .return_gb + + cmp x29,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are 
encrypted/decrypted in .xts_encrypt_blocks_gb + b.eq .xts_encrypt_blocks_gb + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb + subs x2,x2,#1 + b.eq .only_2blks_tweak_gb +.xts_encrypt_blocks_gb: + rbit v16.16b,v16.16b +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x12,v16.d[0] + mov x13,v16.d[1] + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 +.Lxts_8_blocks_process_gb: + cmp x2,#8 + mov v16.d[0],x12 + mov v16.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov w7,0x87 + extr x9,x27,x27,#32 + extr x13,x27,x26,#63 + and w8,w7,w9,asr#31 + eor x12,x8,x26,lsl#1 + mov v17.d[0],x14 + mov v17.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov v18.d[0],x16 + mov v18.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov v19.d[0],x18 + mov v19.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov v20.d[0],x20 + mov v20.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov v21.d[0],x22 + mov v21.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov v22.d[0],x24 + mov v22.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov v23.d[0],x26 + mov v23.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 + b.lt .Lxts_4_blocks_process_gb + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + rbit v18.16b,v18.16b + rbit v19.16b,v19.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + rbit v20.16b,v20.16b + rbit v21.16b,v21.16b + rbit v22.16b,v22.16b + rbit v23.16b,v23.16b + eor v8.16b, v8.16b, v20.16b + eor v9.16b, v9.16b, v21.16b + eor v10.16b, v10.16b, v22.16b + eor v11.16b, v11.16b, v23.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef 
__AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + zip1 v0.4s,v8.4s,v9.4s + zip2 v1.4s,v8.4s,v9.4s + zip1 v2.4s,v10.4s,v11.4s + zip2 v3.4s,v10.4s,v11.4s + zip1 v8.2d,v0.2d,v2.2d + zip2 v9.2d,v0.2d,v2.2d + zip1 v10.2d,v1.2d,v3.2d + zip2 v11.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + + // save the last tweak + mov v25.16b,v23.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs x2,x2,#8 + b.gt .Lxts_8_blocks_process_gb + b 100f +.Lxts_4_blocks_process_gb: + cmp x2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + rbit v18.16b,v18.16b + rbit v19.16b,v19.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub x2,x2,#4 + mov v16.16b,v20.16b + mov v17.16b,v21.16b + mov v18.16b,v22.16b + // save the last tweak + mov v25.16b,v19.16b +1: + // process last block + cmp x2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + rbit v16.16b,v16.16b + eor v4.16b, v4.16b, v16.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, 
{v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v16.16b + st1 {v4.4s},[x1],#16 + // save the last tweak + mov v25.16b,v16.16b + b 100f +1: // process last 2 blocks + cmp x2,#2 + b.gt 1f + ld1 {v4.4s,v5.4s},[x0],#32 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save the last tweak + mov v25.16b,v17.16b + b 100f +1: // process last 3 blocks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + rbit v16.16b,v16.16b + rbit v17.16b,v17.16b + rbit v18.16b,v18.16b + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b +#ifndef 
__AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save the last tweak + mov v25.16b,v18.16b +100: + cmp x29,0 + b.eq .return_gb + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak_gb: +#ifdef __AARCH64EB__ + rev32 v25.16b,v25.16b +#endif + rbit v2.16b,v25.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v17.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + rbit v17.16b,v17.16b + rbit v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + rbit v18.16b,v18.16b + b .check_dec_gb + + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak_gb: + mov v17.16b,v16.16b +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + rbit v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + rbit v18.16b,v18.16b + b .check_dec_gb + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. 
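(Note on the tweak arithmetic above: the shl/ext/ushr/mul/eor sequence in .last_2blks_tweak_gb, driven by the 0x87 constant loaded from .Lxts_magic, is the standard XTS multiply-by-x in GF(2^128); the GB variant additionally wraps it in rbit so the doubling is performed on the bit-reversed representation. A minimal scalar C sketch of that single step, assuming IEEE P1619 little-endian byte order; the helper name is illustrative only:

    #include <stdint.h>

    /* Multiply a 128-bit XTS tweak by x in GF(2^128), reduction polynomial
     * x^128 + x^7 + x^2 + x + 1 (the 0x87 in .Lxts_magic).  Little-endian
     * byte order assumed; the GB variant bit-reverses each byte around this. */
    static void xts_mul_x(uint8_t t[16])
    {
        uint8_t carry = 0;

        for (int i = 0; i < 16; i++) {
            uint8_t msb = t[i] >> 7;            /* bit carried into the next byte */
            t[i] = (uint8_t)((t[i] << 1) | carry);
            carry = msb;
        }
        if (carry)
            t[0] ^= 0x87;                       /* fold the overflow back in */
    }

The eight-tweak chain built in .Lxts_8_blocks_process_gb computes the same doubling on 64-bit register halves with extr/asr/and/eor instead of vector instructions.)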
+.check_dec_gb: + // encryption:1 decryption:0 + cmp w28,1 + b.eq .process_last_2blks_gb + mov v0.16B,v17.16b + mov v17.16B,v18.16b + mov v18.16B,v0.16b + +.process_last_2blks_gb: +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v17.16b + st1 {v4.4s},[x1],#16 + + sub x26,x1,16 +.loop_gb: + subs x29,x29,1 + ldrb w7,[x26,x29] + ldrb w8,[x0,x29] + strb w8,[x26,x29] + strb w7,[x1,x29] + b.gt .loop_gb + ld1 {v4.4s}, [x26] + eor 
v4.16b, v4.16b, v18.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v18.16b + st1 {v4.4s}, [x26] +.return_gb: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb +.globl vpsm4_ex_xts_encrypt +.type 
vpsm4_ex_xts_encrypt,%function +.align 5 +vpsm4_ex_xts_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! + mov x26,x3 + mov x27,x4 + mov w28,w6 + ld1 {v16.4s}, [x5] + mov x3,x27 + adrp x9, .Lsbox_magic + ldr q26, [x9, #:lo12:.Lsbox_magic] + ldr q27, [x9, #:lo12:.Lsbox_magic+16] + ldr q28, [x9, #:lo12:.Lsbox_magic+32] + ldr q29, [x9, #:lo12:.Lsbox_magic+48] + ldr q30, [x9, #:lo12:.Lsbox_magic+64] + ldr q31, [x9, #:lo12:.Lsbox_magic+80] +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v16.s[0] + mov w13,v16.s[1] + mov w14,v16.s[2] + mov w15,v16.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror 
#32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v16.s[0],w15 + mov v16.s[1],w14 + mov v16.s[2],w13 + mov v16.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x3,x26 + and x29,x2,#0x0F + // convert length into blocks + lsr x2,x2,4 + cmp x2,#1 + b.lt .return + + cmp x29,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are encrypted/decrypted in .xts_encrypt_blocks + b.eq .xts_encrypt_blocks + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks + subs x2,x2,#1 + b.eq .only_2blks_tweak +.xts_encrypt_blocks: +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov x12,v16.d[0] + mov x13,v16.d[1] + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 +.Lxts_8_blocks_process: + cmp x2,#8 + mov v16.d[0],x12 + mov v16.d[1],x13 +#ifdef __AARCH64EB__ + rev32 v16.16b,v16.16b +#endif + mov w7,0x87 + extr x9,x27,x27,#32 + extr x13,x27,x26,#63 + and w8,w7,w9,asr#31 + eor x12,x8,x26,lsl#1 + mov v17.d[0],x14 + mov v17.d[1],x15 +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + mov w7,0x87 + extr x9,x13,x13,#32 + extr x15,x13,x12,#63 + and w8,w7,w9,asr#31 + eor x14,x8,x12,lsl#1 + mov v18.d[0],x16 + mov v18.d[1],x17 +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + mov w7,0x87 + extr x9,x15,x15,#32 + extr x17,x15,x14,#63 + and w8,w7,w9,asr#31 + eor x16,x8,x14,lsl#1 + mov v19.d[0],x18 + mov v19.d[1],x19 +#ifdef __AARCH64EB__ + rev32 v19.16b,v19.16b +#endif + mov w7,0x87 + extr x9,x17,x17,#32 + extr x19,x17,x16,#63 + and w8,w7,w9,asr#31 + eor x18,x8,x16,lsl#1 + mov v20.d[0],x20 + mov v20.d[1],x21 +#ifdef __AARCH64EB__ + rev32 v20.16b,v20.16b +#endif + mov w7,0x87 + extr x9,x19,x19,#32 + extr x21,x19,x18,#63 + and w8,w7,w9,asr#31 + eor x20,x8,x18,lsl#1 + mov v21.d[0],x22 + mov v21.d[1],x23 +#ifdef __AARCH64EB__ + rev32 v21.16b,v21.16b +#endif + mov w7,0x87 + extr x9,x21,x21,#32 + extr x23,x21,x20,#63 + and w8,w7,w9,asr#31 + eor x22,x8,x20,lsl#1 + mov v22.d[0],x24 + mov v22.d[1],x25 +#ifdef __AARCH64EB__ + rev32 v22.16b,v22.16b +#endif + mov w7,0x87 + extr x9,x23,x23,#32 + extr x25,x23,x22,#63 + and w8,w7,w9,asr#31 + eor x24,x8,x22,lsl#1 + mov v23.d[0],x26 + mov v23.d[1],x27 +#ifdef __AARCH64EB__ + rev32 v23.16b,v23.16b +#endif + mov w7,0x87 + extr x9,x25,x25,#32 + extr x27,x25,x24,#63 + and w8,w7,w9,asr#31 + eor x26,x8,x24,lsl#1 + b.lt .Lxts_4_blocks_process + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b + ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 + eor v8.16b, v8.16b, v20.16b + eor v9.16b, v9.16b, v21.16b + eor v10.16b, v10.16b, v22.16b + eor 
v11.16b, v11.16b, v23.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif +#ifndef __AARCH64EB__ + rev32 v8.16b,v8.16b +#endif +#ifndef __AARCH64EB__ + rev32 v9.16b,v9.16b +#endif +#ifndef __AARCH64EB__ + rev32 v10.16b,v10.16b +#endif +#ifndef __AARCH64EB__ + rev32 v11.16b,v11.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + zip1 v0.4s,v8.4s,v9.4s + zip2 v1.4s,v8.4s,v9.4s + zip1 v2.4s,v10.4s,v11.4s + zip2 v3.4s,v10.4s,v11.4s + zip1 v8.2d,v0.2d,v2.2d + zip2 v9.2d,v0.2d,v2.2d + zip1 v10.2d,v1.2d,v3.2d + zip2 v11.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_8blks + zip1 v8.4s,v0.4s,v1.4s + zip2 v9.4s,v0.4s,v1.4s + zip1 v10.4s,v2.4s,v3.4s + zip2 v11.4s,v2.4s,v3.4s + zip1 v0.2d,v8.2d,v10.2d + zip2 v1.2d,v8.2d,v10.2d + zip1 v2.2d,v9.2d,v11.2d + zip2 v3.2d,v9.2d,v11.2d + zip1 v8.4s,v4.4s,v5.4s + zip2 v9.4s,v4.4s,v5.4s + zip1 v10.4s,v6.4s,v7.4s + zip2 v11.4s,v6.4s,v7.4s + zip1 v4.2d,v8.2d,v10.2d + zip2 v5.2d,v8.2d,v10.2d + zip1 v6.2d,v9.2d,v11.2d + zip2 v7.2d,v9.2d,v11.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + + // save the last tweak + mov v25.16b,v23.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 + subs x2,x2,#8 + b.gt .Lxts_8_blocks_process + b 100f +.Lxts_4_blocks_process: + cmp x2,#4 + b.lt 1f + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b + eor v7.16b, v7.16b, v19.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif +#ifndef __AARCH64EB__ + rev32 v7.16b,v7.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 + sub x2,x2,#4 + mov v16.16b,v20.16b + mov v17.16b,v21.16b + mov v18.16b,v22.16b + // save the last tweak + mov v25.16b,v19.16b +1: + // process last block + cmp x2,#1 + b.lt 100f + b.gt 1f + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v16.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and 
v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v16.16b + st1 {v4.4s},[x1],#16 + // save the last tweak + mov v25.16b,v16.16b + b 100f +1: // process last 2 blocks + cmp x2,#2 + b.gt 1f + ld1 {v4.4s,v5.4s},[x0],#32 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef __AARCH64EB__ + rev32 v5.16b,v5.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + st1 {v0.4s,v1.4s},[x1],#32 + // save the last tweak + mov v25.16b,v17.16b + b 100f +1: // process last 3 blocks + ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 + eor v4.16b, v4.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v6.16b, v6.16b, v18.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif +#ifndef 
__AARCH64EB__ + rev32 v5.16b,v5.16b +#endif +#ifndef __AARCH64EB__ + rev32 v6.16b,v6.16b +#endif + zip1 v0.4s,v4.4s,v5.4s + zip2 v1.4s,v4.4s,v5.4s + zip1 v2.4s,v6.4s,v7.4s + zip2 v3.4s,v6.4s,v7.4s + zip1 v4.2d,v0.2d,v2.2d + zip2 v5.2d,v0.2d,v2.2d + zip1 v6.2d,v1.2d,v3.2d + zip2 v7.2d,v1.2d,v3.2d + bl _vpsm4_ex_enc_4blks + zip1 v4.4s,v0.4s,v1.4s + zip2 v5.4s,v0.4s,v1.4s + zip1 v6.4s,v2.4s,v3.4s + zip2 v7.4s,v2.4s,v3.4s + zip1 v0.2d,v4.2d,v6.2d + zip2 v1.2d,v4.2d,v6.2d + zip1 v2.2d,v5.2d,v7.2d + zip2 v3.2d,v5.2d,v7.2d + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + st1 {v0.4s,v1.4s,v2.4s},[x1],#48 + // save the last tweak + mov v25.16b,v18.16b +100: + cmp x29,0 + b.eq .return + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak: +#ifdef __AARCH64EB__ + rev32 v25.16b,v25.16b +#endif + mov v2.16b,v25.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v17.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + mov v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + b .check_dec + + +// This branch calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak: + mov v17.16b,v16.16b +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif + mov v2.16b,v17.16b + adrp x9, .Lxts_magic + ldr q0, [x9, #:lo12:.Lxts_magic] + shl v18.16b, v2.16b, #1 + ext v1.16b, v2.16b, v2.16b,#15 + ushr v1.16b, v1.16b, #7 + mul v1.16b, v1.16b, v0.16b + eor v18.16b, v18.16b, v1.16b + b .check_dec + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. 
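(The scalar round loops labelled "10:" throughout this file, including .process_last_2blks below, implement the SM4 round X[i+4] = X[i] ^ L(Sbox(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i])); the "optimize sbox using AESE instruction" blocks evaluate the S-box through two tbl affine transforms around an aese with a zero round key, and the four "eor w6,w7,w7,ror #32-2/-10/-18/-24" lines are the linear transform L. A C sketch of one unrolled group of four rounds, with the S-box abstracted behind a caller-supplied function because this sketch does not reproduce the AESE trick:

    #include <stdint.h>

    /* SM4 linear transform L(B) = B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24),
     * matching the "eor w6,w7,w7,ror #32-2 / #32-10 / #32-18 / #32-24" sequence. */
    static uint32_t rol32(uint32_t x, unsigned r) { return (x << r) | (x >> (32 - r)); }

    static uint32_t sm4_L(uint32_t b)
    {
        return b ^ rol32(b, 2) ^ rol32(b, 10) ^ rol32(b, 18) ^ rol32(b, 24);
    }

    /* One group of four SM4 rounds, as unrolled in the "10:" loops.  'sbox'
     * applies the SM4 S-box to each byte of its argument; it stands in for the
     * AESE-based S-box evaluation and is an assumption of this sketch. */
    static void sm4_four_rounds(uint32_t b[4], const uint32_t rk[4],
                                uint32_t (*sbox)(uint32_t))
    {
        b[0] ^= sm4_L(sbox(b[1] ^ b[2] ^ b[3] ^ rk[0]));
        b[1] ^= sm4_L(sbox(b[0] ^ b[2] ^ b[3] ^ rk[1]));
        b[2] ^= sm4_L(sbox(b[0] ^ b[1] ^ b[3] ^ rk[2]));
        b[3] ^= sm4_L(sbox(b[0] ^ b[1] ^ b[2] ^ rk[3]));
    }

Eight iterations of this group give the 32 SM4 rounds counted down in w11, after which the assembly stores the state words in reverse order (v4.s[0]=w15 ... v4.s[3]=w12), as SM4 specifies.)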
+.check_dec: + // encryption:1 decryption:0 + cmp w28,1 + b.eq .process_last_2blks + mov v0.16B,v17.16b + mov v17.16B,v18.16b + mov v18.16B,v0.16b + +.process_last_2blks: +#ifdef __AARCH64EB__ + rev32 v17.16b,v17.16b +#endif +#ifdef __AARCH64EB__ + rev32 v18.16b,v18.16b +#endif + ld1 {v4.4s},[x0],#16 + eor v4.16b, v4.16b, v17.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v17.16b + st1 {v4.4s},[x1],#16 + + sub x26,x1,16 +.loop: + subs x29,x29,1 + ldrb w7,[x26,x29] + ldrb w8,[x0,x29] + strb w8,[x26,x29] + strb w7,[x1,x29] + b.gt .loop + ld1 {v4.4s}, [x26] + eor v4.16b, v4.16b, 
v18.16b +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + mov x10,x3 + mov w11,#8 + mov w12,v4.s[0] + mov w13,v4.s[1] + mov w14,v4.s[2] + mov w15,v4.s[3] +10: + ldp w7,w8,[x10],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor w6,w14,w15 + eor w9,w7,w13 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w12,w12,w6 + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor w6,w14,w15 + eor w9,w12,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + ldp w7,w8,[x10],8 + eor w13,w13,w6 + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor w6,w12,w13 + eor w9,w7,w15 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w14,w14,w6 + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor w6,w12,w13 + eor w9,w14,w8 + eor w6,w6,w9 + mov v3.s[0],w6 + // optimize sbox using AESE instruction + tbl v0.16b, {v3.16b}, v26.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v28.16b}, v0.16b + tbl v2.16b, {v27.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v1.16b + aese v0.16b,v1.16b + ushr v2.16b, v0.16b, 4 + and v0.16b, v0.16b, v31.16b + tbl v0.16b, {v30.16b}, v0.16b + tbl v2.16b, {v29.16b}, v2.16b + eor v0.16b, v0.16b, v2.16b + + mov w7,v0.s[0] + eor w6,w7,w7,ror #32-2 + eor w6,w6,w7,ror #32-10 + eor w6,w6,w7,ror #32-18 + eor w6,w6,w7,ror #32-24 + eor w15,w15,w6 + subs w11,w11,#1 + b.ne 10b + mov v4.s[0],w15 + mov v4.s[1],w14 + mov v4.s[2],w13 + mov v4.s[3],w12 +#ifndef __AARCH64EB__ + rev32 v4.16b,v4.16b +#endif + eor v4.16b, v4.16b, v18.16b + st1 {v4.4s}, [x26] +.return: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt |
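(The ".loop" byte swap near the end of vpsm4_ex_xts_encrypt, together with the two tweaks prepared in .last_2blks_tweak/.only_2blks_tweak and exchanged in .check_dec for decryption, is standard XTS ciphertext stealing for a final partial block. A C sketch of that tail, assuming a helper xts_block() that performs the tweak-XOR, SM4 rounds, and second tweak-XOR done inline by the assembly; the helper name, its signature, and the pointer layout are illustrative assumptions:

    #include <stdint.h>
    #include <string.h>

    /* Ciphertext-stealing tail: 'in' holds the last full 16-byte block followed
     * by 'tail' leftover bytes; 'out' points at output block m-1.  For
     * encryption t1/t2 are the tweaks for blocks m-1 and m; .check_dec swaps
     * them before this point when decrypting. */
    static void xts_cts_tail(uint8_t *out, const uint8_t *in, size_t tail,
                             const uint8_t t1[16], const uint8_t t2[16],
                             void (*xts_block)(uint8_t blk[16], const uint8_t tweak[16]))
    {
        uint8_t cc[16];

        /* CC = cipher(block m-1, t1); the assembly stores this at out[m-1] first. */
        memcpy(cc, in, 16);
        xts_block(cc, t1);

        /* Steal the first 'tail' bytes of CC as the short final output block and
         * splice the remaining input bytes into CC (the ".loop" byte swap). */
        memcpy(out + 16, cc, tail);
        memcpy(cc, in + 16, tail);

        /* Cipher the spliced block with t2 and write it back over block m-1. */
        xts_block(cc, t2);
        memcpy(out, cc, 16);
    }

When the input length is a multiple of 16 this path is skipped entirely (the b.eq .xts_encrypt_blocks branch), and the function returns through .return after restoring the callee-saved registers.)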