Diffstat (limited to 'sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S')
| -rw-r--r-- | sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S | 1323 |
1 file changed, 1323 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S new file mode 100644 index 000000000000..3d1b045127e2 --- /dev/null +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S @@ -0,0 +1,1323 @@ +// SPDX-License-Identifier: Apache-2.0 +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_VAES) && defined(HAVE_VPCLMULQDQ) + +#define _ASM +#include <sys/asm_linkage.h> + +/* Windows userland links with OpenSSL */ +#if !defined (_WIN32) || defined (_KERNEL) + +.section .rodata +.balign 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.balign 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +ENTRY_ALIGN(gcm_init_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + + + vmovdqu (%rsi),%xmm3 + // KCF/ICP stores H in network byte order with the hi qword first + // so we need to swap all bytes, not the 2 qwords. + vmovdqu .Lbswap_mask(%rip),%xmm4 + vpshufb %xmm4,%xmm3,%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm1 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + RET + +.cfi_endproc +SET_SIZE(gcm_init_vpclmulqdq_avx2) 
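Reviewer note: the comment in gcm_init_vpclmulqdq_avx2 above points out that KCF/ICP hands H over in network byte order with the high qword first, so the routine applies .Lbswap_mask (a vpshufb that reverses all 16 bytes) rather than merely exchanging the two qwords. A minimal C sketch of that distinction, purely illustrative; the helper names below are hypothetical and not part of the ICP sources:

#include <stdint.h>
#include <string.h>

/* What vpshufb with .Lbswap_mask does: dest[i] = src[15 - i]. */
static void
h_reverse_all_bytes(uint8_t out[16], const uint8_t in[16])
{
	for (int i = 0; i < 16; i++)
		out[i] = in[15 - i];
}

/* What the code deliberately does NOT do: swap the qwords, keep byte order. */
static void
h_swap_qwords_only(uint8_t out[16], const uint8_t in[16])
{
	memcpy(out, in + 8, 8);
	memcpy(out + 8, in, 8);
}

int
main(void)
{
	uint8_t h[16], a[16], b[16];

	for (int i = 0; i < 16; i++)
		h[i] = (uint8_t)i;
	h_reverse_all_bytes(a, h);	/* a = 0f 0e ... 01 00 */
	h_swap_qwords_only(b, h);	/* b = 08 09 ... 0f 00 01 ... 07 */
	(void)a;
	(void)b;
	return (0);
}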
+ENTRY_ALIGN(gcm_gmult_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu .Lbswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu .Lgfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + + + RET + +.cfi_endproc +SET_SIZE(gcm_gmult_vpclmulqdq_avx2) +ENTRY_ALIGN(gcm_ghash_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm6 + vmovdqu .Lgfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + cmpq $32,%rcx + jb .Lghash_lastblock + + + + vinserti128 $1,%xmm6,%ymm6,%ymm6 + vinserti128 $1,%xmm7,%ymm7,%ymm7 + + cmpq $127,%rcx + jbe .Lghash_loop_1x + + + vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +.Lghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja .Lghash_loop_4x + + + cmpq $32,%rcx + jb .Lghash_loop_1x_done +.Lghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq 
$32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae .Lghash_loop_1x +.Lghash_loop_1x_done: + + +.Lghash_lastblock: + testq %rcx,%rcx + jz .Lghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + RET + +.cfi_endproc +SET_SIZE(gcm_ghash_vpclmulqdq_avx2) +ENTRY_ALIGN(aes_gcm_enc_update_vaes_avx2, 32) +.cfi_startproc + +ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+6(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 504(%rcx),%r10d // ICP has a larger offset for rounds. + leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds. + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.balign 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func1 + je .Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc 
%ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 
0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq 
$0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + RET + +.cfi_endproc +SET_SIZE(aes_gcm_enc_update_vaes_avx2) +ENTRY_ALIGN(aes_gcm_dec_update_vaes_avx2, 32) +.cfi_startproc + +ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 504(%rcx),%r10d // ICP has a larger offset for rounds. + leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds. 
+ + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.balign 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq 
%ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je 
.Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + RET + +.cfi_endproc +SET_SIZE(aes_gcm_dec_update_vaes_avx2) + +#endif /* !_WIN32 || _KERNEL */ + +/* Mark the stack non-executable. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ |
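Reviewer note on the round-count handling in aes_gcm_enc_update_vaes_avx2 and aes_gcm_dec_update_vaes_avx2 above: the movl 504(%rcx),%r10d / leal -24(,%r10,4),%r10d pair loads the AES round count (10, 12, or 14 in ICP, not OpenSSL's 9/11/13, per the inline comments) and rescales it so that the cmpl $24,%r10d dispatch and the leaq 96(%rcx,%r10,4),%r11 last-round-key address both work out. A small C sketch of that arithmetic, assuming only that the round keys start at offset 0 of the key structure (the 504 offset itself comes from the diff):

#include <stdio.h>

int
main(void)
{
	/* nr is the AES round count ICP stores at offset 504 of the key. */
	for (int nr = 10; nr <= 14; nr += 2) {
		int r10d = 4 * nr - 24;		/* leal -24(,%r10,4),%r10d */
		int last_key = 96 + 4 * r10d;	/* leaq 96(%rcx,%r10,4),%r11 */
		const char *path = (r10d < 24) ? ".Laes128 (jl)" :
		    (r10d == 24) ? ".Laes192 (je)" :
		    "two extra AES-256 rounds first";
		printf("nr=%2d -> r10d=%2d, last round key at key+%d, %s\n",
		    nr, r10d, last_key, path);
	}
	return (0);
}

For 10/12/14 rounds this gives 16/24/32 for the comparison value and key+160/192/224 for the last round key, i.e. key + 16*nr, which is why the per-round key loads above use negative offsets from %r11.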
